Skip to content

Commit

Permalink
Hj/nxf comp/update bivar (#27)
Browse files Browse the repository at this point in the history
* revert some files to main.

* fix the known sorting issue.

* make process_gene same as main_fix_sort branch

* conf4 are now the same.

* force var_grp to be sorted by the index.

* add index-based sorting for the var index.

* temporarily turn off the dedup.

* change the sorting for index of each group df.

* make confidence sorting stable.
  • Loading branch information
hyunhwan-bcm authored Aug 1, 2024
1 parent 159d7f9 commit 95af064
Show file tree
Hide file tree
Showing 7 changed files with 63 additions and 64 deletions.
3 changes: 0 additions & 3 deletions bin/add_c_nc.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,6 @@ def add_c_nc(score, ref):
bl = clin_nc.new_start.values
bc = clin_nc.new_chr.values

bc[bc=='MT'] = -1 # Treat MT chromosome as NaN values to avoid string comparison
bc = bc.astype(int)

i, j = np.where((a[:, None] >= bl) & (a[:, None] <= bh) & (ac[:, None] == bc))

cln = pd.concat(
Expand Down
3 changes: 2 additions & 1 deletion bin/extraModel/generate_bivar_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):
gene_var_pairs = []
for geneEnsId, grouped_df in geneEnsId_varId_df.groupby('geneEnsId'):
# Choose up to the 6 most meaningful variants.
grouped_df.sort_index()
if grouped_df.shape[0] > 6:
grouped_df = grouped_df \
.join(default_pred, on='varId') \
Expand All @@ -68,7 +69,7 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):

# Remove all the duplicated variant pairs.
gene_var_pairs_df = pd.DataFrame(gene_var_pairs)
gene_var_pairs_df = gene_var_pairs_df.drop_duplicates(['varId1', 'varId2'])
# gene_var_pairs_df = gene_var_pairs_df.drop_duplicates(['varId1', 'varId2'])

# Use only subset columns of features
subset_feature_names = "diffuse_Phrank_STRING,hgmdSymptomScore,omimSymMatchFlag,hgmdSymMatchFlag,clinVarSymMatchFlag,omimGeneFound,omimVarFound,hgmdGeneFound,hgmdVarFound,clinVarVarFound,clinVarGeneFound,clinvarNumP,clinvarNumLP,clinvarNumLB,clinvarNumB,dgvVarFound,decipherVarFound,curationScoreHGMD,curationScoreOMIM,curationScoreClinVar,conservationScoreDGV,omimSymptomSimScore,hgmdSymptomSimScore,GERPpp_RS,gnomadAF,gnomadAFg,LRT_score,LRT_Omega,phyloP100way_vertebrate,gnomadGeneZscore,gnomadGenePLI,gnomadGeneOELof,gnomadGeneOELofUpper,IMPACT,CADD_phred,CADD_PHRED,DANN_score,REVEL_score,fathmm_MKL_coding_score,conservationScoreGnomad,conservationScoreOELof,Polyphen2_HDIV_score,Polyphen2_HVAR_score,SIFT_score,zyg,FATHMM_score,M_CAP_score,MutationAssessor_score,ESP6500_AA_AF,ESP6500_EA_AF,hom,hgmd_rs,spliceAImax,nc_ClinVar_Exp,nc_HGMD_Exp,nc_isPLP,nc_isBLB,c_isPLP,c_isBLB,nc_CLNREVSTAT,c_CLNREVSTAT,nc_RANKSCORE,c_RANKSCORE,CLASS,phrank,isB/LB,isP/LP,cons_transcript_ablation,cons_splice_acceptor_variant,cons_splice_donor_variant,cons_stop_gained,cons_frameshift_variant,cons_stop_lost,cons_start_lost,cons_transcript_amplification,cons_inframe_insertion,cons_inframe_deletion,cons_missense_variant,cons_protein_altering_variant,cons_splice_region_variant,cons_splice_donor_5th_base_variant,cons_splice_donor_region_variant,c_ClinVar_Exp_Del_to_Missense,c_ClinVar_Exp_Different_pChange,c_ClinVar_Exp_Same_pChange,c_HGMD_Exp_Del_to_Missense,c_HGMD_Exp_Different_pChange,c_HGMD_Exp_Same_pChange,c_HGMD_Exp_Stop_Loss,c_HGMD_Exp_Start_Loss,IMPACT.from.Tier,TierAD,TierAR,TierAR.adj,No.Var.HM,No.Var.H,No.Var.M,No.Var.L,AD.matched,AR.matched,recessive,dominant,simple_repeat".split(',')
Expand Down
4 changes: 3 additions & 1 deletion bin/extraModel/integrate_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ def process_recessive_matrix(df):

result = []
for _, var_grp in var_grps:
var_grp = var_grp.sort_values("predict", ascending=False)
var_grp = var_grp.sort_values("var2", kind="stable")
var_grp = var_grp.sort_values("predict", ascending=False, kind="stable")
result.append(var_grp.iloc[[0], :])
result = pd.concat(result)
result.set_index("var1", inplace=True)
Expand All @@ -31,6 +32,7 @@ def integrate_output(prj_folder, data_folder, sample_id):
expanded_df = pd.read_csv(
expanded_fn, sep="\t", index_col=0, compression="infer"
)
expanded_df = expanded_df.sort_index()
# expanded_df.append(df)
# expanded_df = pd.concat(expanded_df)
# expanded_df = expanded_df.loc[~expanded_df.index.duplicated(keep='first')]
Expand Down
3 changes: 2 additions & 1 deletion bin/extraModel_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,9 @@ def AIM(data_folder, sample_id):
df_pred.loc[:, model_dict[mn]["features"]]
)[:, 1]
df_pred.insert(loc=df_pred.shape[1] - 1, column="predict", value=predict)
df_pred = df_pred.sort_index()
df_pred = assign_confidence_score(model_dict[mn]["ref"], df_pred)
df_pred = df_pred.sort_values("confidence", ascending=False)
df_pred = df_pred.sort_values("confidence", ascending=False, kind="stable")
df_pred = assign_ranking(df_pred)
df_pred.to_csv(f"{out_folder}/{sample_id}_{mn}_predictions.csv")
else:
Expand Down
Loading

0 comments on commit 95af064

Please sign in to comment.