make_mat test fixes #538

Merged: 4 commits, Feb 5, 2025
2 changes: 2 additions & 0 deletions conda/environment.yml
@@ -9,3 +9,5 @@ dependencies:
- parmed
- gitpython
- pyyaml
- scipy
- h5py
4 changes: 3 additions & 1 deletion requirements.txt
@@ -2,4 +2,6 @@ ParmEd>=3.4.3
numpy>=1.26.2
pandas>=2.1.3
pyyaml>=6.0.1
gitpython>=3.1.43
gitpython>=3.1.43
h5py>=3.11.0
scipy>=1.13.0
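
The two new dependencies can be picked up through either packaging route; as a minimal sketch, assuming a standard pip or conda workflow (the PR itself only updates the two files above):

    pip install -r requirements.txt
    # or refresh the conda environment in place
    conda env update -f conda/environment.yml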
1 change: 0 additions & 1 deletion test/run_make_mat.sh
@@ -3,7 +3,6 @@
set -e
set -o pipefail

tar -zxf test_inputs/make_mat_ttr/hh.tgz -C test_inputs/make_mat_ttr/
python ../tools/make_mat/make_mat.py --histo test_inputs/make_mat_ttr/hh.tgz --target_top test_inputs/make_mat_ttr/topol_md.top --mego_top test_inputs/make_mat_ttr/topol_ref.top --cutoff 0.75 --mode intra+same --out test_inputs/make_mat_ttr/ --tar
diff <(gzip -dc test_inputs/make_mat_ttr/intramat_1_1.ndx.gz) <(gzip -dc test_outputs/make_mat_ttr/intramat_1_1.ndx.gz)
diff <(gzip -dc test_inputs/make_mat_ttr/intermat_1_1.ndx.gz) <(gzip -dc test_outputs/make_mat_ttr/intermat_1_1.ndx.gz)
187 changes: 103 additions & 84 deletions tools/make_mat/make_mat.py
@@ -49,6 +49,9 @@ def read_mat(name, protein_ref_indices, args, cumulative=False):
if args.tar:
with tarfile.open(args.histo, "r:*") as tar:
ref_df = pd.read_csv(tar.extractfile(name), header=None, sep="\s+", usecols=[0, *protein_ref_indices])
ref_df_columns = ["distance", *[str(x) for x in protein_ref_indices]]
ref_df.columns = ref_df_columns
ref_df.set_index("distance", inplace=True)
else:
if not args.h5:
ref_df = pd.read_csv(f"{path_prefix}/{name}", header=None, sep="\s+", usecols=[0, *protein_ref_indices])
@@ -58,7 +61,7 @@
else:
with h5py.File(f"{path_prefix}/{name}", "r") as f:
if "density" not in f:
raise KeyError(f"Dataset 'density' not found in {h5_file}")
raise KeyError(f"Dataset 'density' not found in {name}")

data = f["density"][:] # Read full dataset
# Extract the first column (distance) and the relevant protein_ref_indices columns
@@ -135,19 +138,15 @@ def run_mat_(arguments):

if np.isin(int(ai), protein_ref_indices_i):
cut_i = np.where(protein_ref_indices_i == int(ai))[0][0]

# column mapping
ref_df = read_mat(ref_f, protein_ref_indices_j, args)
ref_df.loc[len(ref_df)] = c12_cutoff[cut_i]

# calculate data
c12_avg_ = zero_probability_decorator(c12_avg, args.zero)
calculate_probability_ = zero_probability_decorator(calculate_probability, args.zero)
get_cumulative_probability_ = zero_probability_decorator(get_cumulative_probability, args.zero)

c12dist = ref_df.apply(lambda x: c12_avg_(ref_df.index.to_numpy(), weights=x.to_numpy()), axis=0).values
c12dist = ref_df.apply(lambda x: c12_avg(ref_df.index.to_numpy(), weights=x.to_numpy()), axis=0).values
if mat_type == "intra":
p = ref_df.apply(
lambda x: calculate_probability_(ref_df.index.to_numpy(), weights=x.to_numpy()),
lambda x: calculate_probability(ref_df.index.to_numpy(), weights=x.to_numpy()),
axis=0,
).values
elif mat_type == "inter":
@@ -156,7 +155,7 @@
c_ref_df = read_mat(c_ref_f, protein_ref_indices_j, args, True)
c_ref_df.loc[len(c_ref_df)] = c12_cutoff[cut_i]
p = c_ref_df.apply(
lambda x: get_cumulative_probability_(c_ref_df.index.to_numpy(), weights=x.to_numpy()),
lambda x: get_cumulative_probability(c_ref_df.index.to_numpy(), weights=x.to_numpy()),
axis=0,
).values

@@ -440,7 +439,8 @@ def c12_avg(values, weights):
res = np.maximum(cutoff / 4.5, 0.1)
log_exp_sum = logsumexp(1.0 / v / res, b=w) - np.log(norm)
exp_aver = (1.0 / res) / log_exp_sum
# exp_aver = (1.0 / res) / np.log(np.sum(w * np.exp(1.0 / v / res)) / norm)
if exp_aver < 0.01:
exp_aver = 0

return exp_aver
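
In equation form, and assuming norm is the sum of the weights as elsewhere in this function, the exponential average computed above (now evaluated through scipy's logsumexp for numerical stability, and zeroed when it falls below 0.01) is

    \[
    \langle r \rangle \;=\; \frac{1/\rho}{\ln\!\Big(\frac{1}{\sum_i w_i}\sum_i w_i\, e^{\,1/(r_i \rho)}\Big)},
    \qquad \rho = \max\!\Big(\frac{\mathrm{cutoff}}{4.5},\, 0.1\Big),
    \]

where the r_i are the histogram distances (ref_df.index) and the w_i the corresponding weights.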

@@ -752,79 +752,90 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
# PARALLEL PROCESS START
########################

if not args.residue:
chunks = np.array_split(target_list, args.num_threads)
else:
chunks = []
n_threshold = sum([len(v) for v in dict_m_m_r.values()]) // args.num_threads
chunk = []
n = 0
for k, v in dict_m_m_r.items():
chunk.append(v)
n += len(v)
if n > n_threshold:
chunks.append(chunk)
chunk = []
n = 0
chunks.append(chunk)
pool = multiprocessing.Pool(args.num_threads)
if args.residue and not args.intra:
results = pool.map(
run_residue_inter_,
[
(
args,
protein_ref_indices_i,
protein_ref_indices_j,
len(ref_ri_to_ai_j),
c12_cutoff,
mol_i,
mol_j,
(ref_ai_to_ri_i, index_ai_to_ri_j),
x,
)
for x in chunks
],
)
if args.zero:

df = pd.DataFrame()
df["mi"] = [mol_i for _ in range(len(protein_ref_indices_i) * len(protein_ref_indices_j))]
df["mj"] = [mol_j for _ in range(len(protein_ref_indices_i) * len(protein_ref_indices_j))]
df["ai"] = np.repeat(protein_ref_indices_i, len(protein_ref_indices_j))
df["aj"] = np.tile(protein_ref_indices_j, len(protein_ref_indices_i))
df["c12dist"] = 0.0
df["p"] = 0.0
df["cutoff"] = [c12_cutoff[i, j] for i in range(len(protein_ref_indices_i)) for j in range(len(protein_ref_indices_j))]
else:
if not args.residue:
chunks = np.array_split(target_list, args.num_threads)
else:
chunks = []
n_threshold = sum([len(v) for v in dict_m_m_r.values()]) // args.num_threads
chunk = []
n = 0
for k, v in dict_m_m_r.items():
chunk.append(v)
n += len(v)
if n > n_threshold:
chunks.append(chunk)
chunk = []
n = 0
chunks.append(chunk)
pool = multiprocessing.Pool(args.num_threads)
if args.residue and not args.intra:
results = pool.map(
run_residue_inter_,
[
(
args,
protein_ref_indices_i,
protein_ref_indices_j,
len(ref_ri_to_ai_j),
c12_cutoff,
mol_i,
mol_j,
(ref_ai_to_ri_i, index_ai_to_ri_j),
x,
)
for x in chunks
],
)
else:

results = pool.map(
run_mat_,
[
(
args,
protein_ref_indices_i,
protein_ref_indices_j,
original_size_j,
c12_cutoff,
mol_i,
mol_j,
x,
mat_type,
)
for x in chunks
],
)
results = pool.map(
run_mat_,
[
(
args,
protein_ref_indices_i,
protein_ref_indices_j,
original_size_j,
c12_cutoff,
mol_i,
mol_j,
x,
mat_type,
)
for x in chunks
],
)

pool.close()
pool.join()
pool.close()
pool.join()

########################
# PARALLEL PROCESS END
########################
########################
# PARALLEL PROCESS END
########################

# concatenate and remove partial dataframes
for name in results:
try:
part_df = pd.read_csv(name)
df = pd.concat([df, part_df])
except pd.errors.EmptyDataError:
print(f"Ignoring partial dataframe in {name} as csv is empty")
[os.remove(name) for name in results]
df = df.astype({"mi": "int32", "mj": "int32", "ai": "int32", "aj": "int32"})
# concatenate and remove partial dataframes
for name in results:
try:
part_df = pd.read_csv(name)
df = pd.concat([df, part_df])
except pd.errors.EmptyDataError:
print(f"Ignoring partial dataframe in {name} as csv is empty")
[os.remove(name) for name in results]
df = df.astype({"mi": "int32", "mj": "int32", "ai": "int32", "aj": "int32"})

df = df.sort_values(by=["mi", "mj", "ai", "aj"])
df.drop_duplicates(subset=["mi", "ai", "mj", "aj"], inplace=True)
df = df.sort_values(by=["mi", "mj", "ai", "aj"])
df.drop_duplicates(subset=["mi", "ai", "mj", "aj"], inplace=True)

df["mi"] = df["mi"].map("{:}".format)
df["mj"] = df["mj"].map("{:}".format)
@@ -848,7 +859,7 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
parser.add_argument(
"--histo",
type=str,
required=True,
required=False,
help='Path to the directory containing the histograms. The histogram files should contain the prefix "intra_" for intra molecular contact descriptions and "inter_" for inter molecular.',
)
parser.add_argument(
@@ -907,18 +918,26 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
)
args = parser.parse_args()

# check either histo or zero flag are set
if not args.histo and not args.zero:
raise ValueError("Either --histo or --zero flag must be set.")
if args.histo and args.zero:
raise ValueError("Both --histo and --zero flags cannot be set at the same time.")

# check if output file exists
if not os.path.exists(args.out):
print(f"The path '{args.out}' does not exist.")
sys.exit()

if not args.tar and not os.path.isdir(args.histo):
print(f"The path '{args.histo}' is not a directory.")
sys.exit()
if not args.zero and not args.tar:
if not os.path.isdir(args.histo):
print(f"The path '{args.histo}' is not a directory.")
sys.exit()

if args.tar and not tarfile.is_tarfile(args.histo):
print(f"The path '{args.histo}' is not a tar file.")
sys.exit()
if not args.zero and args.tar:
if not tarfile.is_tarfile(args.histo):
print(f"The path '{args.histo}' is not a tar file.")
sys.exit()

if args.tar and args.h5:
print("cannot use --tar and --h5, choose one.")
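
With the new checks above, make_mat.py runs in one of two mutually exclusive modes. As a hypothetical pair of invocations patterned on test/run_make_mat.sh (run from the test/ directory; the exact set of companion flags required with --zero is not spelled out in this diff):

    # histogram mode, reading the histograms straight from a tar archive
    python ../tools/make_mat/make_mat.py --histo test_inputs/make_mat_ttr/hh.tgz --target_top test_inputs/make_mat_ttr/topol_md.top --mego_top test_inputs/make_mat_ttr/topol_ref.top --cutoff 0.75 --mode intra+same --out test_inputs/make_mat_ttr/ --tar
    # zero-probability mode, no histograms required
    python ../tools/make_mat/make_mat.py --zero --target_top test_inputs/make_mat_ttr/topol_md.top --mego_top test_inputs/make_mat_ttr/topol_ref.top --cutoff 0.75 --mode intra+same --out test_inputs/make_mat_ttr/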