From 60ee7539899a7f7823cbf9eff32f02309b750dd6 Mon Sep 17 00:00:00 2001
From: Carlo Camilloni
Date: Wed, 5 Feb 2025 17:18:24 +0100
Subject: [PATCH 1/4] make_mat.py: add --zero option and update Python requirements

---
 conda/environment.yml      |   2 +
 requirements.txt           |   4 +-
 tools/make_mat/make_mat.py | 179 ++++++++++++++++++++-----------------
 3 files changed, 102 insertions(+), 83 deletions(-)

diff --git a/conda/environment.yml b/conda/environment.yml
index 06c57b7e..2fd479a6 100644
--- a/conda/environment.yml
+++ b/conda/environment.yml
@@ -9,3 +9,5 @@ dependencies:
   - parmed
   - gitpython
   - pyyaml
+  - scipy
+  - h5py
diff --git a/requirements.txt b/requirements.txt
index 5b33bbc9..23b4a2a6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,6 @@ ParmEd>=3.4.3
 numpy>=1.26.2
 pandas>=2.1.3
 pyyaml>=6.0.1
-gitpython>=3.1.43
\ No newline at end of file
+gitpython>=3.1.43
+h5py>=3.11.0
+scipy>=1.13.0
diff --git a/tools/make_mat/make_mat.py b/tools/make_mat/make_mat.py
index 680d75f1..000bb2e4 100644
--- a/tools/make_mat/make_mat.py
+++ b/tools/make_mat/make_mat.py
@@ -135,19 +135,15 @@ def run_mat_(arguments):
         if np.isin(int(ai), protein_ref_indices_i):
             cut_i = np.where(protein_ref_indices_i == int(ai))[0][0]
 
+            # column mapping
             ref_df = read_mat(ref_f, protein_ref_indices_j, args)
             ref_df.loc[len(ref_df)] = c12_cutoff[cut_i]
 
-            # calculate data
-            c12_avg_ = zero_probability_decorator(c12_avg, args.zero)
-            calculate_probability_ = zero_probability_decorator(calculate_probability, args.zero)
-            get_cumulative_probability_ = zero_probability_decorator(get_cumulative_probability, args.zero)
-
-            c12dist = ref_df.apply(lambda x: c12_avg_(ref_df.index.to_numpy(), weights=x.to_numpy()), axis=0).values
+            c12dist = ref_df.apply(lambda x: c12_avg(ref_df.index.to_numpy(), weights=x.to_numpy()), axis=0).values
 
             if mat_type == "intra":
                 p = ref_df.apply(
-                    lambda x: calculate_probability_(ref_df.index.to_numpy(), weights=x.to_numpy()),
+                    lambda x: calculate_probability(ref_df.index.to_numpy(), weights=x.to_numpy()),
                     axis=0,
                 ).values
             elif mat_type == "inter":
@@ -156,7 +152,7 @@
             c_ref_df = read_mat(c_ref_f, protein_ref_indices_j, args, True)
             c_ref_df.loc[len(c_ref_df)] = c12_cutoff[cut_i]
             p = c_ref_df.apply(
-                lambda x: get_cumulative_probability_(c_ref_df.index.to_numpy(), weights=x.to_numpy()),
+                lambda x: get_cumulative_probability(c_ref_df.index.to_numpy(), weights=x.to_numpy()),
                 axis=0,
             ).values
 
@@ -752,79 +748,90 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
     ########################
    # PARALLEL PROCESS START
     ########################
-    if not args.residue:
-        chunks = np.array_split(target_list, args.num_threads)
-    else:
-        chunks = []
-        n_threshold = sum([len(v) for v in dict_m_m_r.values()]) // args.num_threads
-        chunk = []
-        n = 0
-        for k, v in dict_m_m_r.items():
-            chunk.append(v)
-            n += len(v)
-            if n > n_threshold:
-                chunks.append(chunk)
-                chunk = []
-                n = 0
-        chunks.append(chunk)
-    pool = multiprocessing.Pool(args.num_threads)
-    if args.residue and not args.intra:
-        results = pool.map(
-            run_residue_inter_,
-            [
-                (
-                    args,
-                    protein_ref_indices_i,
-                    protein_ref_indices_j,
-                    len(ref_ri_to_ai_j),
-                    c12_cutoff,
-                    mol_i,
-                    mol_j,
-                    (ref_ai_to_ri_i, index_ai_to_ri_j),
-                    x,
-                )
-                for x in chunks
-            ],
-        )
+    if args.zero:
+
+        df = pd.DataFrame()
+        df["mi"] = [mol_i for _ in range(len(protein_ref_indices_i) * len(protein_ref_indices_j))]
+        df["mj"] = [mol_j for _ in range(len(protein_ref_indices_i) * len(protein_ref_indices_j))]
+        df["ai"] = np.repeat(protein_ref_indices_i, len(protein_ref_indices_j))
+        df["aj"] = np.tile(protein_ref_indices_j, len(protein_ref_indices_i))
+        df["c12dist"] = 0.0
+        df["p"] = 0.0
+        df["cutoff"] = [c12_cutoff[i, j] for i in range(len(protein_ref_indices_i)) for j in range(len(protein_ref_indices_j))]
     else:
+        if not args.residue:
+            chunks = np.array_split(target_list, args.num_threads)
+        else:
+            chunks = []
+            n_threshold = sum([len(v) for v in dict_m_m_r.values()]) // args.num_threads
+            chunk = []
+            n = 0
+            for k, v in dict_m_m_r.items():
+                chunk.append(v)
+                n += len(v)
+                if n > n_threshold:
+                    chunks.append(chunk)
+                    chunk = []
+                    n = 0
+            chunks.append(chunk)
+        pool = multiprocessing.Pool(args.num_threads)
+        if args.residue and not args.intra:
+            results = pool.map(
+                run_residue_inter_,
+                [
+                    (
+                        args,
+                        protein_ref_indices_i,
+                        protein_ref_indices_j,
+                        len(ref_ri_to_ai_j),
+                        c12_cutoff,
+                        mol_i,
+                        mol_j,
+                        (ref_ai_to_ri_i, index_ai_to_ri_j),
+                        x,
+                    )
+                    for x in chunks
+                ],
+            )
+        else:
 
-        results = pool.map(
-            run_mat_,
-            [
-                (
-                    args,
-                    protein_ref_indices_i,
-                    protein_ref_indices_j,
-                    original_size_j,
-                    c12_cutoff,
-                    mol_i,
-                    mol_j,
-                    x,
-                    mat_type,
-                )
-                for x in chunks
-            ],
-        )
+            results = pool.map(
+                run_mat_,
+                [
+                    (
+                        args,
+                        protein_ref_indices_i,
+                        protein_ref_indices_j,
+                        original_size_j,
+                        c12_cutoff,
+                        mol_i,
+                        mol_j,
+                        x,
+                        mat_type,
+                    )
+                    for x in chunks
+                ],
+            )
 
-    pool.close()
-    pool.join()
+        pool.close()
+        pool.join()
 
-    ########################
-    # PARALLEL PROCESS END
-    ########################
+        ########################
+        # PARALLEL PROCESS END
+        ########################
 
-    # concatenate and remove partial dataframes
-    for name in results:
-        try:
-            part_df = pd.read_csv(name)
-            df = pd.concat([df, part_df])
-        except pd.errors.EmptyDataError:
-            print(f"Ignoring partial dataframe in {name} as csv is empty")
-    [os.remove(name) for name in results]
-    df = df.astype({"mi": "int32", "mj": "int32", "ai": "int32", "aj": "int32"})
+        # concatenate and remove partial dataframes
+        for name in results:
+            try:
+                part_df = pd.read_csv(name)
+                df = pd.concat([df, part_df])
+            except pd.errors.EmptyDataError:
+                print(f"Ignoring partial dataframe in {name} as csv is empty")
+        [os.remove(name) for name in results]
+        df = df.astype({"mi": "int32", "mj": "int32", "ai": "int32", "aj": "int32"})
 
-    df = df.sort_values(by=["mi", "mj", "ai", "aj"])
-    df.drop_duplicates(subset=["mi", "ai", "mj", "aj"], inplace=True)
+        df = df.sort_values(by=["mi", "mj", "ai", "aj"])
+        df.drop_duplicates(subset=["mi", "ai", "mj", "aj"], inplace=True)
 
     df["mi"] = df["mi"].map("{:}".format)
     df["mj"] = df["mj"].map("{:}".format)
@@ -848,7 +855,7 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
     parser.add_argument(
         "--histo",
         type=str,
-        required=True,
+        required=False,
         help='Path to the directory containing the histograms. The histogram files should contain the prefix "intra_" for intra molecular contact descriptions and "inter_" for inter molecular.',
     )
     parser.add_argument(
@@ -907,18 +914,26 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
     )
     args = parser.parse_args()
 
+    # check that either the --histo or the --zero flag is set
+    if not args.histo and not args.zero:
+        raise ValueError("Either --histo or --zero flag must be set.")
+    if args.histo and args.zero:
+        raise ValueError("Both --histo and --zero flags cannot be set at the same time.")
+
     # check if output file exists
     if not os.path.exists(args.out):
         print(f"The path '{args.out}' does not exist.")
        sys.exit()
 
-    if not args.tar and not os.path.isdir(args.histo):
-        print(f"The path '{args.histo}' is not a directory.")
-        sys.exit()
+    if not args.zero and not args.tar:
+        if not os.path.isdir(args.histo):
+            print(f"The path '{args.histo}' is not a directory.")
+            sys.exit()
 
-    if args.tar and not tarfile.is_tarfile(args.histo):
-        print(f"The path '{args.histo}' is not a tar file.")
-        sys.exit()
+    if not args.zero and args.tar:
+        if not tarfile.is_tarfile(args.histo):
+            print(f"The path '{args.histo}' is not a tar file.")
+            sys.exit()
 
     if args.tar and args.h5:
         print("cannot use --tar and --h5, choose one.")
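Note on the new --zero branch above: it fills the contact table without reading any histogram, pairing every ai with every aj. np.repeat stretches the i indices block-wise while np.tile cycles the j indices, so the two columns line up as the full Cartesian product. A minimal standalone sketch of that indexing pattern (the toy index arrays are hypothetical, not taken from the repository):

import numpy as np
import pandas as pd

# hypothetical toy atom indices, for illustration only
protein_ref_indices_i = np.array([1, 2, 3])
protein_ref_indices_j = np.array([10, 20])

df = pd.DataFrame()
df["ai"] = np.repeat(protein_ref_indices_i, len(protein_ref_indices_j))  # 1 1 2 2 3 3
df["aj"] = np.tile(protein_ref_indices_j, len(protein_ref_indices_i))  # 10 20 10 20 10 20
df["c12dist"] = 0.0  # zeroed distance signal for every pair
df["p"] = 0.0  # zeroed contact probability for every pair

# one row per (ai, aj) combination
assert len(df) == len(protein_ref_indices_i) * len(protein_ref_indices_j)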
From 328d30de2773e32a86da5c0feba4088beb0558a5 Mon Sep 17 00:00:00 2001
From: Carlo Camilloni
Date: Wed, 5 Feb 2025 17:41:27 +0100
Subject: [PATCH 2/4] make_mat.py: fix histogram reading from tar archives

---
 tools/make_mat/make_mat.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/make_mat/make_mat.py b/tools/make_mat/make_mat.py
index 000bb2e4..7127b1fd 100644
--- a/tools/make_mat/make_mat.py
+++ b/tools/make_mat/make_mat.py
@@ -49,6 +49,9 @@ def read_mat(name, protein_ref_indices, args, cumulative=False):
     if args.tar:
         with tarfile.open(args.histo, "r:*") as tar:
             ref_df = pd.read_csv(tar.extractfile(name), header=None, sep="\s+", usecols=[0, *protein_ref_indices])
+            ref_df_columns = ["distance", *[str(x) for x in protein_ref_indices]]
+            ref_df.columns = ref_df_columns
+            ref_df.set_index("distance", inplace=True)
     else:
         if not args.h5:
             ref_df = pd.read_csv(f"{path_prefix}/{name}", header=None, sep="\s+", usecols=[0, *protein_ref_indices])

From 7313ea47b67666bc1fbb1806415760f147cf9377 Mon Sep 17 00:00:00 2001
From: Carlo Camilloni
Date: Wed, 5 Feb 2025 18:00:06 +0100
Subject: [PATCH 3/4] make_mat.py: test tar input directly and zero out small c12 averages

---
 test/run_make_mat.sh       | 1 -
 tools/make_mat/make_mat.py | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/run_make_mat.sh b/test/run_make_mat.sh
index 0f8c8c54..8802a014 100755
--- a/test/run_make_mat.sh
+++ b/test/run_make_mat.sh
@@ -3,7 +3,6 @@
 set -e
 set -o pipefail
 
-tar -zxf test_inputs/make_mat_ttr/hh.tgz -C test_inputs/make_mat_ttr/
 python ../tools/make_mat/make_mat.py --histo test_inputs/make_mat_ttr/hh.tgz --target_top test_inputs/make_mat_ttr/topol_md.top --mego_top test_inputs/make_mat_ttr/topol_ref.top --cutoff 0.75 --mode intra+same --out test_inputs/make_mat_ttr/ --tar
 diff <(gzip -dc test_inputs/make_mat_ttr/intramat_1_1.ndx.gz) <(gzip -dc test_outputs/make_mat_ttr/intramat_1_1.ndx.gz)
 diff <(gzip -dc test_inputs/make_mat_ttr/intermat_1_1.ndx.gz) <(gzip -dc test_outputs/make_mat_ttr/intermat_1_1.ndx.gz)
diff --git a/tools/make_mat/make_mat.py b/tools/make_mat/make_mat.py
index 7127b1fd..4be7d205 100644
--- a/tools/make_mat/make_mat.py
+++ b/tools/make_mat/make_mat.py
@@ -439,7 +439,8 @@ def c12_avg(values, weights):
     res = np.maximum(cutoff / 4.5, 0.1)
 
     log_exp_sum = logsumexp(1.0 / v / res, b=w) - np.log(norm)
     exp_aver = (1.0 / res) / log_exp_sum
-    # exp_aver = (1.0 / res) / np.log(np.sum(w * np.exp(1.0 / v / res)) / norm)
+    if exp_aver < 0.01:
+        exp_aver = 0
 
     return exp_aver

From 7566bec9881e87bcd4cc2e756019cb0ad73aeb40 Mon Sep 17 00:00:00 2001
From: Carlo Camilloni
Date: Wed, 5 Feb 2025 18:01:05 +0100
Subject: [PATCH 4/4] fix flake8 undefined-name warning

---
 tools/make_mat/make_mat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/make_mat/make_mat.py b/tools/make_mat/make_mat.py
index 4be7d205..5b8b7928 100644
--- a/tools/make_mat/make_mat.py
+++ b/tools/make_mat/make_mat.py
@@ -61,7 +61,7 @@ def read_mat(name, protein_ref_indices, args, cumulative=False):
     else:
         with h5py.File(f"{path_prefix}/{name}", "r") as f:
             if "density" not in f:
-                raise KeyError(f"Dataset 'density' not found in {h5_file}")
+                raise KeyError(f"Dataset 'density' not found in {name}")
             data = f["density"][:]  # Read full dataset
 
             # Extract the first column (distance) and the relevant protein_ref_indices columns
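Note on the c12_avg change touched by PATCH 3 and PATCH 4: the function computes an exponentially weighted average that is dominated by the smallest sampled distances, using scipy's logsumexp so that exp(1/(v*res)) cannot overflow at small v, and now clamps near-zero results to exactly zero. A self-contained sketch of that logic (the function name, signature, and cutoff default are illustrative simplifications, not the repository's exact code):

import numpy as np
from scipy.special import logsumexp


def c12_avg_sketch(values, weights, cutoff=0.75):
    # values: positive distances; weights: histogram counts
    v = np.asarray(values, dtype=float)
    w = np.asarray(weights, dtype=float)
    norm = np.sum(w)
    res = np.maximum(cutoff / 4.5, 0.1)
    # numerically stable form of log(sum(w * exp(1 / (v * res))) / norm)
    log_exp_sum = logsumexp(1.0 / v / res, b=w) - np.log(norm)
    exp_aver = (1.0 / res) / log_exp_sum
    # clamp vanishing averages to exactly zero, as PATCH 3 does,
    # so downstream code can treat them as "no contact"
    if exp_aver < 0.01:
        exp_aver = 0
    return exp_aver


# toy usage: 50 distances between 0.2 and 0.7 nm with uniform weights
print(c12_avg_sketch(np.linspace(0.2, 0.7, 50), np.ones(50)))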