From 60ee7539899a7f7823cbf9eff32f02309b750dd6 Mon Sep 17 00:00:00 2001
From: Carlo Camilloni
Date: Wed, 5 Feb 2025 17:18:24 +0100
Subject: [PATCH 1/4] make_mat.py: add --zero option and update Python requirements

---
 conda/environment.yml      |   2 +
 requirements.txt           |   4 +-
 tools/make_mat/make_mat.py | 179 ++++++++++++++++++++-----------------
 3 files changed, 102 insertions(+), 83 deletions(-)

diff --git a/conda/environment.yml b/conda/environment.yml
index 06c57b7e..2fd479a6 100644
--- a/conda/environment.yml
+++ b/conda/environment.yml
@@ -9,3 +9,5 @@ dependencies:
   - parmed
   - gitpython
   - pyyaml
+  - scipy
+  - h5py
diff --git a/requirements.txt b/requirements.txt
index 5b33bbc9..23b4a2a6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,6 @@ ParmEd>=3.4.3
 numpy>=1.26.2
 pandas>=2.1.3
 pyyaml>=6.0.1
-gitpython>=3.1.43
\ No newline at end of file
+gitpython>=3.1.43
+h5py>=3.11.0
+scipy>=1.13.0
diff --git a/tools/make_mat/make_mat.py b/tools/make_mat/make_mat.py
index 680d75f1..000bb2e4 100644
--- a/tools/make_mat/make_mat.py
+++ b/tools/make_mat/make_mat.py
@@ -135,19 +135,15 @@ def run_mat_(arguments):
         if np.isin(int(ai), protein_ref_indices_i):
             cut_i = np.where(protein_ref_indices_i == int(ai))[0][0]
 
+            # column mapping
             ref_df = read_mat(ref_f, protein_ref_indices_j, args)
             ref_df.loc[len(ref_df)] = c12_cutoff[cut_i]
 
-            # calculate data
-            c12_avg_ = zero_probability_decorator(c12_avg, args.zero)
-            calculate_probability_ = zero_probability_decorator(calculate_probability, args.zero)
-            get_cumulative_probability_ = zero_probability_decorator(get_cumulative_probability, args.zero)
-
-            c12dist = ref_df.apply(lambda x: c12_avg_(ref_df.index.to_numpy(), weights=x.to_numpy()), axis=0).values
+            c12dist = ref_df.apply(lambda x: c12_avg(ref_df.index.to_numpy(), weights=x.to_numpy()), axis=0).values
 
             if mat_type == "intra":
                 p = ref_df.apply(
-                    lambda x: calculate_probability_(ref_df.index.to_numpy(), weights=x.to_numpy()),
+                    lambda x: calculate_probability(ref_df.index.to_numpy(), weights=x.to_numpy()),
                     axis=0,
                 ).values
             elif mat_type == "inter":
@@ -156,7 +152,7 @@
             c_ref_df = read_mat(c_ref_f, protein_ref_indices_j, args, True)
             c_ref_df.loc[len(c_ref_df)] = c12_cutoff[cut_i]
             p = c_ref_df.apply(
-                lambda x: get_cumulative_probability_(c_ref_df.index.to_numpy(), weights=x.to_numpy()),
+                lambda x: get_cumulative_probability(c_ref_df.index.to_numpy(), weights=x.to_numpy()),
                 axis=0,
             ).values
 
@@ -752,79 +748,90 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
     ########################
    # PARALLEL PROCESS START
     ########################
-    if not args.residue:
-        chunks = np.array_split(target_list, args.num_threads)
-    else:
-        chunks = []
-        n_threshold = sum([len(v) for v in dict_m_m_r.values()]) // args.num_threads
-        chunk = []
-        n = 0
-        for k, v in dict_m_m_r.items():
-            chunk.append(v)
-            n += len(v)
-            if n > n_threshold:
-                chunks.append(chunk)
-                chunk = []
-                n = 0
-        chunks.append(chunk)
-    pool = multiprocessing.Pool(args.num_threads)
-    if args.residue and not args.intra:
-        results = pool.map(
-            run_residue_inter_,
-            [
-                (
-                    args,
-                    protein_ref_indices_i,
-                    protein_ref_indices_j,
-                    len(ref_ri_to_ai_j),
-                    c12_cutoff,
-                    mol_i,
-                    mol_j,
-                    (ref_ai_to_ri_i, index_ai_to_ri_j),
-                    x,
-                )
-                for x in chunks
-            ],
-        )
+    if args.zero:
+
+        df = pd.DataFrame()
+        df["mi"] = [mol_i for _ in range(len(protein_ref_indices_i) * len(protein_ref_indices_j))]
+        df["mj"] = [mol_j for _ in range(len(protein_ref_indices_i) * len(protein_ref_indices_j))]
+        df["ai"] = np.repeat(protein_ref_indices_i, len(protein_ref_indices_j))
+        df["aj"] = np.tile(protein_ref_indices_j, len(protein_ref_indices_i))
+        df["c12dist"] = 0.0
+        df["p"] = 0.0
+        df["cutoff"] = [c12_cutoff[i, j] for i in range(len(protein_ref_indices_i)) for j in range(len(protein_ref_indices_j))]
     else:
+        if not args.residue:
+            chunks = np.array_split(target_list, args.num_threads)
+        else:
+            chunks = []
+            n_threshold = sum([len(v) for v in dict_m_m_r.values()]) // args.num_threads
+            chunk = []
+            n = 0
+            for k, v in dict_m_m_r.items():
+                chunk.append(v)
+                n += len(v)
+                if n > n_threshold:
+                    chunks.append(chunk)
+                    chunk = []
+                    n = 0
+            chunks.append(chunk)
+        pool = multiprocessing.Pool(args.num_threads)
+        if args.residue and not args.intra:
+            results = pool.map(
+                run_residue_inter_,
+                [
+                    (
+                        args,
+                        protein_ref_indices_i,
+                        protein_ref_indices_j,
+                        len(ref_ri_to_ai_j),
+                        c12_cutoff,
+                        mol_i,
+                        mol_j,
+                        (ref_ai_to_ri_i, index_ai_to_ri_j),
+                        x,
+                    )
+                    for x in chunks
+                ],
+            )
+        else:
 
-        results = pool.map(
-            run_mat_,
-            [
-                (
-                    args,
-                    protein_ref_indices_i,
-                    protein_ref_indices_j,
-                    original_size_j,
-                    c12_cutoff,
-                    mol_i,
-                    mol_j,
-                    x,
-                    mat_type,
-                )
-                for x in chunks
-            ],
-        )
+            results = pool.map(
+                run_mat_,
+                [
+                    (
+                        args,
+                        protein_ref_indices_i,
+                        protein_ref_indices_j,
+                        original_size_j,
+                        c12_cutoff,
+                        mol_i,
+                        mol_j,
+                        x,
+                        mat_type,
+                    )
+                    for x in chunks
+                ],
+            )
 
-    pool.close()
-    pool.join()
+        pool.close()
+        pool.join()
 
-    ########################
-    # PARALLEL PROCESS END
-    ########################
+        ########################
+        # PARALLEL PROCESS END
+        ########################
 
-    # concatenate and remove partial dataframes
-    for name in results:
-        try:
-            part_df = pd.read_csv(name)
-            df = pd.concat([df, part_df])
-        except pd.errors.EmptyDataError:
-            print(f"Ignoring partial dataframe in {name} as csv is empty")
-    [os.remove(name) for name in results]
-    df = df.astype({"mi": "int32", "mj": "int32", "ai": "int32", "aj": "int32"})
+        # concatenate and remove partial dataframes
+        for name in results:
+            try:
+                part_df = pd.read_csv(name)
+                df = pd.concat([df, part_df])
+            except pd.errors.EmptyDataError:
+                print(f"Ignoring partial dataframe in {name} as csv is empty")
+        [os.remove(name) for name in results]
+        df = df.astype({"mi": "int32", "mj": "int32", "ai": "int32", "aj": "int32"})
 
-    df = df.sort_values(by=["mi", "mj", "ai", "aj"])
-    df.drop_duplicates(subset=["mi", "ai", "mj", "aj"], inplace=True)
+        df = df.sort_values(by=["mi", "mj", "ai", "aj"])
+        df.drop_duplicates(subset=["mi", "ai", "mj", "aj"], inplace=True)
 
     df["mi"] = df["mi"].map("{:}".format)
     df["mj"] = df["mj"].map("{:}".format)
@@ -848,7 +855,7 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
     parser.add_argument(
         "--histo",
         type=str,
-        required=True,
+        required=False,
         help='Path to the directory containing the histograms. The histogram files should contain the prefix "intra_" for intra molecular contact descriptions and "inter_" for inter molecular.',
     )
     parser.add_argument(
@@ -907,18 +914,26 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
     )
     args = parser.parse_args()
 
+    # check that either the --histo or the --zero flag is set
+    if not args.histo and not args.zero:
+        raise ValueError("Either --histo or --zero flag must be set.")
+    if args.histo and args.zero:
+        raise ValueError("Both --histo and --zero flags cannot be set at the same time.")
+
     # check if output file exists
     if not os.path.exists(args.out):
         print(f"The path '{args.out}' does not exist.")
        sys.exit()
 
-    if not args.tar and not os.path.isdir(args.histo):
-        print(f"The path '{args.histo}' is not a directory.")
-        sys.exit()
+    if not args.zero and not args.tar:
+        if not os.path.isdir(args.histo):
+            print(f"The path '{args.histo}' is not a directory.")
+            sys.exit()
 
-    if args.tar and not tarfile.is_tarfile(args.histo):
-        print(f"The path '{args.histo}' is not a tar file.")
-        sys.exit()
+    if not args.zero and args.tar:
+        if not tarfile.is_tarfile(args.histo):
+            print(f"The path '{args.histo}' is not a tar file.")
+            sys.exit()
 
     if args.tar and args.h5:
         print("cannot use --tar and --h5, choose one.")
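Note on the new --zero branch above: it fills the contact table without reading any histogram, pairing every ai with every aj. np.repeat stretches the i indices block-wise while np.tile cycles the j indices, so the two columns line up as the full Cartesian product. A minimal standalone sketch of that indexing pattern (the toy index arrays are hypothetical, not taken from the repository):

import numpy as np
import pandas as pd

# hypothetical toy atom indices, for illustration only
protein_ref_indices_i = np.array([1, 2, 3])
protein_ref_indices_j = np.array([10, 20])

df = pd.DataFrame()
df["ai"] = np.repeat(protein_ref_indices_i, len(protein_ref_indices_j))  # 1 1 2 2 3 3
df["aj"] = np.tile(protein_ref_indices_j, len(protein_ref_indices_i))  # 10 20 10 20 10 20
df["c12dist"] = 0.0  # zeroed distance signal for every pair
df["p"] = 0.0  # zeroed contact probability for every pair

# one row per (ai, aj) combination
assert len(df) == len(protein_ref_indices_i) * len(protein_ref_indices_j)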
From 328d30de2773e32a86da5c0feba4088beb0558a5 Mon Sep 17 00:00:00 2001
From: Carlo Camilloni
Date: Wed, 5 Feb 2025 17:41:27 +0100
Subject: [PATCH 2/4] make_mat.py: fix histogram reading from tar archives

---
 tools/make_mat/make_mat.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/make_mat/make_mat.py b/tools/make_mat/make_mat.py
index 000bb2e4..7127b1fd 100644
--- a/tools/make_mat/make_mat.py
+++ b/tools/make_mat/make_mat.py
@@ -49,6 +49,9 @@ def read_mat(name, protein_ref_indices, args, cumulative=False):
     if args.tar:
         with tarfile.open(args.histo, "r:*") as tar:
             ref_df = pd.read_csv(tar.extractfile(name), header=None, sep="\s+", usecols=[0, *protein_ref_indices])
+            ref_df_columns = ["distance", *[str(x) for x in protein_ref_indices]]
+            ref_df.columns = ref_df_columns
+            ref_df.set_index("distance", inplace=True)
     else:
         if not args.h5:
             ref_df = pd.read_csv(f"{path_prefix}/{name}", header=None, sep="\s+", usecols=[0, *protein_ref_indices])

From 7313ea47b67666bc1fbb1806415760f147cf9377 Mon Sep 17 00:00:00 2001
From: Carlo Camilloni
Date: Wed, 5 Feb 2025 18:00:06 +0100
Subject: [PATCH 3/4] make_mat.py: test tar input directly and zero out small c12 averages

---
 test/run_make_mat.sh       | 1 -
 tools/make_mat/make_mat.py | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/run_make_mat.sh b/test/run_make_mat.sh
index 0f8c8c54..8802a014 100755
--- a/test/run_make_mat.sh
+++ b/test/run_make_mat.sh
@@ -3,7 +3,6 @@
 set -e
 set -o pipefail
 
-tar -zxf test_inputs/make_mat_ttr/hh.tgz -C test_inputs/make_mat_ttr/
 python ../tools/make_mat/make_mat.py --histo test_inputs/make_mat_ttr/hh.tgz --target_top test_inputs/make_mat_ttr/topol_md.top --mego_top test_inputs/make_mat_ttr/topol_ref.top --cutoff 0.75 --mode intra+same --out test_inputs/make_mat_ttr/ --tar
 diff <(gzip -dc test_inputs/make_mat_ttr/intramat_1_1.ndx.gz) <(gzip -dc test_outputs/make_mat_ttr/intramat_1_1.ndx.gz)
 diff <(gzip -dc test_inputs/make_mat_ttr/intermat_1_1.ndx.gz) <(gzip -dc test_outputs/make_mat_ttr/intermat_1_1.ndx.gz)
diff --git a/tools/make_mat/make_mat.py b/tools/make_mat/make_mat.py
index 7127b1fd..4be7d205 100644
--- a/tools/make_mat/make_mat.py
+++ b/tools/make_mat/make_mat.py
@@ -439,7 +439,8 @@ def c12_avg(values, weights):
     res = np.maximum(cutoff / 4.5, 0.1)
 
     log_exp_sum = logsumexp(1.0 / v / res, b=w) - np.log(norm)
     exp_aver = (1.0 / res) / log_exp_sum
-    # exp_aver = (1.0 / res) / np.log(np.sum(w * np.exp(1.0 / v / res)) / norm)
+    if exp_aver < 0.01:
+        exp_aver = 0
 
     return exp_aver

From 7566bec9881e87bcd4cc2e756019cb0ad73aeb40 Mon Sep 17 00:00:00 2001
From: Carlo Camilloni
Date: Wed, 5 Feb 2025 18:01:05 +0100
Subject: [PATCH 4/4] fix flake8 undefined-name warning

---
 tools/make_mat/make_mat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/make_mat/make_mat.py b/tools/make_mat/make_mat.py
index 4be7d205..5b8b7928 100644
--- a/tools/make_mat/make_mat.py
+++ b/tools/make_mat/make_mat.py
@@ -61,7 +61,7 @@ def read_mat(name, protein_ref_indices, args, cumulative=False):
     else:
         with h5py.File(f"{path_prefix}/{name}", "r") as f:
             if "density" not in f:
-                raise KeyError(f"Dataset 'density' not found in {h5_file}")
+                raise KeyError(f"Dataset 'density' not found in {name}")
             data = f["density"][:]  # Read full dataset
 
             # Extract the first column (distance) and the relevant protein_ref_indices columns
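Note on the c12_avg change touched by PATCH 3 and PATCH 4: the function computes an exponentially weighted average that is dominated by the smallest sampled distances, using scipy's logsumexp so that exp(1/(v*res)) cannot overflow at small v, and now clamps near-zero results to exactly zero. A self-contained sketch of that logic (the function name, signature, and cutoff default are illustrative simplifications, not the repository's exact code):

import numpy as np
from scipy.special import logsumexp


def c12_avg_sketch(values, weights, cutoff=0.75):
    # values: positive distances; weights: histogram counts
    v = np.asarray(values, dtype=float)
    w = np.asarray(weights, dtype=float)
    norm = np.sum(w)
    res = np.maximum(cutoff / 4.5, 0.1)
    # numerically stable form of log(sum(w * exp(1 / (v * res))) / norm)
    log_exp_sum = logsumexp(1.0 / v / res, b=w) - np.log(norm)
    exp_aver = (1.0 / res) / log_exp_sum
    # clamp vanishing averages to exactly zero, as PATCH 3 does,
    # so downstream code can treat them as "no contact"
    if exp_aver < 0.01:
        exp_aver = 0
    return exp_aver


# toy usage: 50 distances between 0.2 and 0.7 nm with uniform weights
print(c12_avg_sketch(np.linspace(0.2, 0.7, 50), np.ones(50)))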