Skip to content

Commit

Permalink
Hj/nxf comp/update bivar (#27)
Browse files Browse the repository at this point in the history
* revert some files to main.

* fix the known sorting issue.

* make process_gene same as main_fix_sort branch

* conf4 are now the same.

* force var_grp to be sorted by the index.

* add index-based sorting for the var index.

* temporarily turn off the dedup.

* change the sorting for index of each group df.

* make confidence sorting stable.
  • Loading branch information
hyunhwan-bcm authored Aug 1, 2024
1 parent 159d7f9 commit 95af064
Show file tree
Hide file tree
Showing 7 changed files with 63 additions and 64 deletions.
3 changes: 0 additions & 3 deletions bin/add_c_nc.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,6 @@ def add_c_nc(score, ref):
bl = clin_nc.new_start.values
bc = clin_nc.new_chr.values

bc[bc=='MT'] = -1 # Treat MT chromosome as NaN values to avoid string comparison
bc = bc.astype(int)

i, j = np.where((a[:, None] >= bl) & (a[:, None] <= bh) & (ac[:, None] == bc))

cln = pd.concat(
Expand Down
3 changes: 2 additions & 1 deletion bin/extraModel/generate_bivar_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):
gene_var_pairs = []
for geneEnsId, grouped_df in geneEnsId_varId_df.groupby('geneEnsId'):
# Choose up to the 6 most meaningful variants.
grouped_df.sort_index()
if grouped_df.shape[0] > 6:
grouped_df = grouped_df \
.join(default_pred, on='varId') \
Expand All @@ -68,7 +69,7 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):

# Remove all the duplicated variant pairs.
gene_var_pairs_df = pd.DataFrame(gene_var_pairs)
gene_var_pairs_df = gene_var_pairs_df.drop_duplicates(['varId1', 'varId2'])
# gene_var_pairs_df = gene_var_pairs_df.drop_duplicates(['varId1', 'varId2'])

# Use only subset columns of features
subset_feature_names = "diffuse_Phrank_STRING,hgmdSymptomScore,omimSymMatchFlag,hgmdSymMatchFlag,clinVarSymMatchFlag,omimGeneFound,omimVarFound,hgmdGeneFound,hgmdVarFound,clinVarVarFound,clinVarGeneFound,clinvarNumP,clinvarNumLP,clinvarNumLB,clinvarNumB,dgvVarFound,decipherVarFound,curationScoreHGMD,curationScoreOMIM,curationScoreClinVar,conservationScoreDGV,omimSymptomSimScore,hgmdSymptomSimScore,GERPpp_RS,gnomadAF,gnomadAFg,LRT_score,LRT_Omega,phyloP100way_vertebrate,gnomadGeneZscore,gnomadGenePLI,gnomadGeneOELof,gnomadGeneOELofUpper,IMPACT,CADD_phred,CADD_PHRED,DANN_score,REVEL_score,fathmm_MKL_coding_score,conservationScoreGnomad,conservationScoreOELof,Polyphen2_HDIV_score,Polyphen2_HVAR_score,SIFT_score,zyg,FATHMM_score,M_CAP_score,MutationAssessor_score,ESP6500_AA_AF,ESP6500_EA_AF,hom,hgmd_rs,spliceAImax,nc_ClinVar_Exp,nc_HGMD_Exp,nc_isPLP,nc_isBLB,c_isPLP,c_isBLB,nc_CLNREVSTAT,c_CLNREVSTAT,nc_RANKSCORE,c_RANKSCORE,CLASS,phrank,isB/LB,isP/LP,cons_transcript_ablation,cons_splice_acceptor_variant,cons_splice_donor_variant,cons_stop_gained,cons_frameshift_variant,cons_stop_lost,cons_start_lost,cons_transcript_amplification,cons_inframe_insertion,cons_inframe_deletion,cons_missense_variant,cons_protein_altering_variant,cons_splice_region_variant,cons_splice_donor_5th_base_variant,cons_splice_donor_region_variant,c_ClinVar_Exp_Del_to_Missense,c_ClinVar_Exp_Different_pChange,c_ClinVar_Exp_Same_pChange,c_HGMD_Exp_Del_to_Missense,c_HGMD_Exp_Different_pChange,c_HGMD_Exp_Same_pChange,c_HGMD_Exp_Stop_Loss,c_HGMD_Exp_Start_Loss,IMPACT.from.Tier,TierAD,TierAR,TierAR.adj,No.Var.HM,No.Var.H,No.Var.M,No.Var.L,AD.matched,AR.matched,recessive,dominant,simple_repeat".split(',')
Expand Down
4 changes: 3 additions & 1 deletion bin/extraModel/integrate_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ def process_recessive_matrix(df):

result = []
for _, var_grp in var_grps:
var_grp = var_grp.sort_values("predict", ascending=False)
var_grp = var_grp.sort_values("var2", kind="stable")
var_grp = var_grp.sort_values("predict", ascending=False, kind="stable")
result.append(var_grp.iloc[[0], :])
result = pd.concat(result)
result.set_index("var1", inplace=True)
Expand All @@ -31,6 +32,7 @@ def integrate_output(prj_folder, data_folder, sample_id):
expanded_df = pd.read_csv(
expanded_fn, sep="\t", index_col=0, compression="infer"
)
expanded_df = expanded_df.sort_index()
# expanded_df.append(df)
# expanded_df = pd.concat(expanded_df)
# expanded_df = expanded_df.loc[~expanded_df.index.duplicated(keep='first')]
Expand Down
3 changes: 2 additions & 1 deletion bin/extraModel_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,9 @@ def AIM(data_folder, sample_id):
df_pred.loc[:, model_dict[mn]["features"]]
)[:, 1]
df_pred.insert(loc=df_pred.shape[1] - 1, column="predict", value=predict)
df_pred = df_pred.sort_index()
df_pred = assign_confidence_score(model_dict[mn]["ref"], df_pred)
df_pred = df_pred.sort_values("confidence", ascending=False)
df_pred = df_pred.sort_values("confidence", ascending=False, kind="stable")
df_pred = assign_ranking(df_pred)
df_pred.to_csv(f"{out_folder}/{sample_id}_{mn}_predictions.csv")
else:
Expand Down
Loading

0 comments on commit 95af064

Please sign in to comment.