Living-with-machines
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 4 deletions b/‎.gitignore‎
Lines changed: 3 additions & 4 deletions
diff --git a/‎CONTRIBUTORS.txt‎
Lines changed: 1 addition & 0 deletions b/‎CONTRIBUTORS.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎DeezyMatch/CONTRIBUTORS.txt‎
Lines changed: 3 additions & 0 deletions b/‎DeezyMatch/CONTRIBUTORS.txt‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎DeezyMatch.py‎ ‎DeezyMatch/DeezyMatch.py‎DeezyMatch.py renamed to DeezyMatch/DeezyMatch.py
Lines changed: 14 additions & 14 deletions b/‎DeezyMatch.py‎ ‎DeezyMatch/DeezyMatch.py‎DeezyMatch.py renamed to DeezyMatch/DeezyMatch.py
Lines changed: 14 additions & 14 deletions
diff --git a/‎DeezyMatch/__init__.py‎
Lines changed: 5 additions & 0 deletions b/‎DeezyMatch/__init__.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎candidateFinder.py‎ ‎DeezyMatch/candidateRanker.py‎candidateFinder.py renamed to DeezyMatch/candidateRanker.py
Lines changed: 20 additions & 11 deletions b/‎candidateFinder.py‎ ‎DeezyMatch/candidateRanker.py‎candidateFinder.py renamed to DeezyMatch/candidateRanker.py
Lines changed: 20 additions & 11 deletions
diff --git a/‎combineVecs.py‎ ‎DeezyMatch/combineVecs.py‎combineVecs.py renamed to DeezyMatch/combineVecs.py
Lines changed: 4 additions & 4 deletions b/‎combineVecs.py‎ ‎DeezyMatch/combineVecs.py‎combineVecs.py renamed to DeezyMatch/combineVecs.py
Lines changed: 4 additions & 4 deletions
diff --git a/‎data_processing.py‎ ‎DeezyMatch/data_processing.py‎data_processing.py renamed to DeezyMatch/data_processing.py
Lines changed: 8 additions & 5 deletions b/‎data_processing.py‎ ‎DeezyMatch/data_processing.py‎data_processing.py renamed to DeezyMatch/data_processing.py
Lines changed: 8 additions & 5 deletions
diff --git a/‎rnn_networks.py‎ ‎DeezyMatch/rnn_networks.py‎rnn_networks.py renamed to DeezyMatch/rnn_networks.py
Lines changed: 13 additions & 10 deletions b/‎rnn_networks.py‎ ‎DeezyMatch/rnn_networks.py‎rnn_networks.py renamed to DeezyMatch/rnn_networks.py
Lines changed: 13 additions & 10 deletions
diff --git a/‎utils.py‎ ‎DeezyMatch/utils.py‎utils.py renamed to DeezyMatch/utils.py
Lines changed: 16 additions & 8 deletions b/‎utils.py‎ ‎DeezyMatch/utils.py‎utils.py renamed to DeezyMatch/utils.py
Lines changed: 16 additions & 8 deletions
@@ -1,8 +1,7 @@
 __pycache__
 .ipynb_checkpoints
-log.txt
-models
-pred_results.txt
 .DS_Store
 default.profraw
-log_test001.png
+DeezyMatch.egg-info
+build
+dist
@@ -0,0 +1 @@
+DeezyMatch/CONTRIBUTORS.txt
@@ -0,0 +1,3 @@
+Coll Ardanuy, Mariona
+Hosseini, Kasra
+Nanni, Federico
@@ -2,7 +2,7 @@
 # -*- coding: UTF-8 -*-
 
 """
-DeezyMatch main code: select the relevant module (train, finetune, inference, combine_vecs, candidate_finder) 
+DeezyMatch main code: select the relevant module (train, finetune, inference, combine_vecs, candidate_ranker) 
 based on the inputs.
 """
 
@@ -12,18 +12,18 @@
 import shutil
 import sys
 
-from candidateFinder import candidate_finder
-from candidateFinder import main as candidate_finder_main
-from combineVecs import combine_vecs
-from combineVecs import main as combine_vecs_main
-from data_processing import csv_split_tokenize
-from rnn_networks import gru_lstm_network, fine_tuning
-from rnn_networks import inference as rnn_inference
-from utils import deezy_mode_detector
-from utils import read_inputs_command, read_inference_command, read_input_file
-from utils import cprint, bc, log_message
+from .candidateRanker import candidate_ranker
+from .candidateRanker import main as candidate_ranker_main
+from .combineVecs import combine_vecs
+from .combineVecs import main as combine_vecs_main
+from .data_processing import csv_split_tokenize
+from .rnn_networks import gru_lstm_network, fine_tuning
+from .rnn_networks import inference as rnn_inference
+from .utils import deezy_mode_detector
+from .utils import read_inputs_command, read_inference_command, read_input_file
+from .utils import cprint, bc, log_message
 # --- set seed for reproducibility
-from utils import set_seed_everywhere
+from .utils import set_seed_everywhere
 set_seed_everywhere(1364)
 
 # ------------------- train --------------------
@@ -282,8 +282,8 @@ def main():
     elif dm_mode in ["combine_vecs"]:
         combine_vecs_main()
 
-    elif dm_mode in ["candidate_finder"]:
-        candidate_finder_main()
+    elif dm_mode in ["candidate_ranker"]:
+        candidate_ranker_main()
 
 if __name__ == '__main__':
     main()
@@ -0,0 +1,5 @@
+from DeezyMatch.DeezyMatch import train
+from DeezyMatch.DeezyMatch import finetune
+from DeezyMatch.DeezyMatch import inference
+from DeezyMatch.DeezyMatch import combine_vecs
+from DeezyMatch.DeezyMatch import candidate_ranker
@@ -19,20 +19,20 @@
 import torch
 from torch.utils.data import DataLoader
 
-from data_processing import test_tokenize
-from rnn_networks import test_model
-from utils import read_input_file
-from utils import read_command_candidate_finder
+from .data_processing import test_tokenize
+from .rnn_networks import test_model
+from .utils import read_input_file
+from .utils import read_command_candidate_ranker
 # --- set seed for reproducibility
-from utils import set_seed_everywhere
+from .utils import set_seed_everywhere
 set_seed_everywhere(1364)
 
 # skip future warnings for now XXX
 import warnings
 warnings.simplefilter(action='ignore', category=FutureWarning)
 
-# ------------------- candidate_finder --------------------
-def candidate_finder(input_file_path="default", scenario=None, ranking_metric="faiss", selection_threshold=0.8, 
+# ------------------- candidate_ranker --------------------
+def candidate_ranker(input_file_path="default", scenario=None, ranking_metric="faiss", selection_threshold=0.8, 
                      num_candidates=10, search_size=4, output_filename=None,
                      pretrained_model_path=None, pretrained_vocab_path=None, number_test_rows=-1):
 
@@ -191,10 +191,19 @@ def candidate_finder(input_file_path="default", scenario=None, ranking_metric="f
                 sys.exit(f"[ERROR] ranking_metric: {ranking_metric} is not implemented. See the documentation.")
 
             num_found_candidates += len(query_candidate_filtered_pd)
-            print("ID: %s/%s -- Number of found candidates so far: %s, search span: 0, %s" % (iq, len(vecs_query), num_found_candidates, id_1_neigh))
+            print("ID: %s/%s -- Number of found candidates so far: %s, searched: %s" % (iq+1, len(vecs_query), num_found_candidates, id_1_neigh))
 
             if num_found_candidates > 0:
                 collect_neigh_pd = collect_neigh_pd.append(query_candidate_filtered_pd)
+            
+            if ranking_metric.lower() in ["faiss"]:
+                # 1.01 is multiplied to avoid issues with float numbers and rounding errors
+                if query_candidate_pd["faiss_dist"].max() > (selection_threshold*1.01):
+                    break
+            elif ranking_metric.lower() in ["cosine"]:
+                # 0.99 is multiplied to avoid issues with float numbers and rounding errors
+                if query_candidate_pd["cosine_sim"].min() < (selection_threshold*0.99):
+                    break 
 
             # Go to the next zone    
             if (num_found_candidates < num_candidates):
@@ -223,7 +232,7 @@ def candidate_finder(input_file_path="default", scenario=None, ranking_metric="f
             mydict_candid_id[row["s2"]] = row["s2_orig_ids"]
         one_row = {
             "id": orig_id_queries, 
-            "toponym": all_queries[0], 
+            "query": all_queries[0], 
             "pred_score": [mydict_dl_match], 
             "faiss_distance": [mydict_faiss_dist], 
             "cosine_sim": [mydict_cosine_sim],
@@ -243,10 +252,10 @@ def main():
     # --- read args from the command line
     output_filename, selection_threshold, ranking_metric, search_size, num_candidates, \
         par_dir, input_file_path, number_test_rows, pretrained_model_path, pretrained_vocab_path = \
-        read_command_candidate_finder()
+        read_command_candidate_ranker()
 
     # --- 
-    candidate_finder(input_file_path=input_file_path, 
+    candidate_ranker(input_file_path=input_file_path, 
                      scenario=par_dir, 
                      ranking_metric=ranking_metric, 
                      selection_threshold=selection_threshold, 
 
@@ -22,11 +22,11 @@
 
 start_time = time.time()
 
-from utils import read_input_file
-from utils import sort_key
-from utils import read_command_combinevecs
+from .utils import read_input_file
+from .utils import sort_key
+from .utils import read_command_combinevecs
 # --- set seed for reproducibility
-from utils import set_seed_everywhere
+from .utils import set_seed_everywhere
 set_seed_everywhere(1364)
 
 # ------------------- combine_vecs --------------------
 
@@ -11,11 +11,11 @@
 import pickle
 from torch.utils.data import Dataset
 
-from utils import cprint, bc
-from utils import string_split
-from utils import normalizeString
+from .utils import cprint, bc
+from .utils import string_split
+from .utils import normalizeString
 # --- set seed for reproducibility
-from utils import set_seed_everywhere
+from .utils import set_seed_everywhere
 set_seed_everywhere(1364)
 
 
@@ -70,7 +70,10 @@ def csv_split_tokenize(dataset_path, pretrained_vocab_path=None, n_train_example
         n_total = len(rows_one_label)
 
         if n_train_examples:
-            # number of positive examples
+            # We have two sets of labels: True and False
+            # Here, we divide the number of requested rows by two
+            # This way 50% of the requested rows will be True and 50% will be False
+            # Compare this with n_train = int(train_prop * n_total) 
             n_pos = int(int(n_train_examples)/2)
             n_train = n_pos
         else:
 
@@ -36,14 +36,14 @@
 import numpy as np
 import sys
 
-from data_processing import test_tokenize
-from utils import cprint, bc, log_message
-from utils import print_stats
-from utils import torch_summarize
-from utils import create_parent_dir
-from utils import eval_map
+from .data_processing import test_tokenize
+from .utils import cprint, bc, log_message
+from .utils import print_stats
+from .utils import torch_summarize
+from .utils import create_parent_dir
+from .utils import eval_map
 # --- set seed for reproducibility
-from utils import set_seed_everywhere
+from .utils import set_seed_everywhere
 set_seed_everywhere(1364)
 
 # skip future warnings for now XXX
@@ -400,7 +400,7 @@ def test_model(model, test_dl, eval_mode='test', valid_desc=None,
     if eval_mode == 'valid':
         eval_desc = valid_desc
     elif eval_mode == 'test':
-        eval_desc = "test"
+        eval_desc = 'Epoch: 0/0; Test'
 
     t_test.set_description(eval_mode)
 
@@ -423,7 +423,9 @@ def test_model(model, test_dl, eval_mode='test', valid_desc=None,
         len2 = len2.numpy()
 
         with torch.no_grad():
-            pred = model(x1, len1, x2, len2, pooling_mode=pooling_mode, device=device, output_state_vectors=output_state_vectors, evaluation=evaluation)
+            pred = model(x1, len1, x2, len2, pooling_mode=pooling_mode, 
+                         device=device, output_state_vectors=output_state_vectors, 
+                         evaluation=evaluation)
             if output_state_vectors:
                 all_preds = []
                 continue
@@ -845,7 +847,8 @@ def inference(model_path, dataset_path, train_vocab_path, input_file_path,
                                    output_preds=dl_inputs['inference']['output_preds'],
                                    output_preds_file=output_preds_file,
                                    csv_sep=dl_inputs['preprocessing']['csv_sep'],
-                                   map_flag=dl_inputs['inference']['eval_map_metric']
+                                   map_flag=dl_inputs['inference']['eval_map_metric'],
+                                   model_path=os.path.dirname(os.path.abspath(model_path))
                                    )
 
     print("--- %s seconds ---" % (time.time() - start_time))
@@ -98,13 +98,13 @@ def deezy_mode_detector():
 
     parser = ArgumentParser()
     parser.add_argument("--deezy_mode", 
-                        help="DeezyMatch mode (options: train, finetune, inference, combine_vecs, candidate_finder)",
+                        help="DeezyMatch mode (options: train, finetune, inference, combine_vecs, candidate_ranker)",
                         default="train",
                         )
     dm_mode, unknown = parser.parse_known_args()
     dm_mode = dm_mode.deezy_mode.lower()
-    if dm_mode not in ["train", "finetune", "inference", "combine_vecs", "candidate_finder"]:
-        parser.exit(f"ERROR: implemeted modes are: train, finetune, inference, combine_vecs, candidate_finder (input: {dm_mode})")     
+    if dm_mode not in ["train", "finetune", "inference", "combine_vecs", "candidate_ranker"]:
+        parser.exit(f"ERROR: implemeted modes are: train, finetune, inference, combine_vecs, candidate_ranker (input: {dm_mode})")     
 
     return dm_mode
 
@@ -201,7 +201,7 @@ def read_inputs_command():
                     parser.exit(f"ERROR: model {fine_tuning_model_path} not found!") 
 
                 if os.path.exists(vocab_path) is False:
-                    parser.exit(f"ERROR: vocab {vocab} not found!")
+                    parser.exit(f"ERROR: vocab {vocab_path} not found!")
 
             else:
                 fine_tuning_model_name = os.path.split(fine_tuning_model)[-1]
@@ -296,13 +296,13 @@ def read_command_combinevecs():
     input_file_path = args.input_file_path
     return qc_mode, cq_sc, rnn_pass, combined_sc, input_file_path
 
-# ------------------- read_command_candidate_finder --------------------
-def read_command_candidate_finder():
+# ------------------- read_command_candidate_ranker --------------------
+def read_command_candidate_ranker():
     parser = ArgumentParser()
 
     parser.add_argument("--deezy_mode",
                     help="DeezyMatch mode",
-                    default="candidate_finder"
+                    default="candidate_ranker"
                     )
 
     parser.add_argument("-t", "--threshold",
@@ -558,7 +558,9 @@ def log_plotter(path2log, dataset="DEFAULT"):
     train_arr = []
     valid_arr = []
     time_arr = []
-    for one_line in log[3:]:
+    for one_line in log[2:]:
+        if one_line.lower().strip().startswith("python"):
+            continue
         line_split = one_line.split()
         datetime_str = line_split[0]
         epoch = int(line_split[3].split("/")[0])
@@ -655,6 +657,12 @@ def log_plotter(path2log, dataset="DEFAULT"):
     plt.subplot(3, 2, 5)
     plt.title(f"Dataset: {dataset}\nTotal time: {total_time}s, Ave. Time / epoch: {total_time/(len(time_arr)-1):.3f}s", size=16)
     plt.plot(train_arr[1:, 0], diff_time, c="k", lw=2)
+
+    # If min_valid_arg is 0 (the first model has the lowest valid loss)
+    # Increment min_valid_arg for Time as we use cumsum (lose one point in the plot)
+    if min_valid_arg == 0:
+        min_valid_arg += 1
+
     if plot_valid:
         plt.axvline(valid_arr[min_valid_arg, 0], 0, 1, ls="--", c="k")
         plt.text(valid_arr[min_valid_arg, 0]*1.05, min(diff_time)*0.98,
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+Coll Ardanuy, Mariona`
	`2`	`+Hosseini, Kasra`
	`3`	`+Nanni, Federico`