---
title: Title
keywords: fastai
sidebar: home_sidebar
nb_path: "nbs/05_Debug_f3393_chr5.ipynb"
---
def _load_sumstats(path):
    """Read one gzip-compressed, tab-separated summary-statistics table.

    The first row is taken as the header and '"' as the quote character.
    """
    return pd.read_csv(
        path,
        compression='gzip',
        header=0,
        sep='\t',
        quotechar='"',
    )


# Imputed-data results (500K run).
ref_first_path ='/home/dmc2245/UKBiobank/results/REGENIE_results/results_imputed_data/2021_10_07_f3393_500K/100521_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k_PC1_PC2_f3393.regenie.snp_stats_original_columns.gz'
ref_sumstats = _load_sumstats(ref_first_path)
ref_sumstats
ref_sumstats.A1FREQ.hist()

# Same file name, stored under the "non_ref_first" sub-directory of that run.
non_ref_first_path = '/home/dmc2245/UKBiobank/results/REGENIE_results/results_imputed_data/2021_10_07_f3393_500K/non_ref_first/100521_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k_PC1_PC2_f3393.regenie.snp_stats_original_columns.gz'
non_sumstats = _load_sumstats(non_ref_first_path)
non_sumstats
non_sumstats.A1FREQ.hist()

# Exome-data results (200K run).
exome_path = '/home/dmc2245/UKBiobank/results/REGENIE_results/results_exome_data/090921_f3393_hearing_aid_200K/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl_PC1_2_f3393.regenie.snp_stats_original_columns.gz'
exome_sumstats1 = _load_sumstats(exome_path)
exome_sumstats1
exome_sumstats1.A1FREQ.hist()
# Rows of the exome table whose A1FREQ exceeds 0.5.
exome_sumstats1[exome_sumstats1.A1FREQ > 0.5]
Some SNPs have an A1FREQ greater than 0.5, so A1 is not always the minor allele.
# Convert two chromosome-5 positions with a Liftover object built from
# ('hg38', 'hg19').
# NOTE(review): Liftover is defined outside this view; the (source, target)
# argument order is presumed from the call — confirm.
lf = Liftover('hg38','hg19')
[lf.chrpos_liftover(5,i) for i in [329201,698951]]
# Imputed chr5 genotype file (.bgen) and its companion .sample file.
imp_geno_path = 'MWE_region_extraction/ukb_imp_chr5_v3_05_272856_1213643.bgen'
bgen_sample_path = 'MWE_region_extraction/ukb_imp_chr5_v3_05_272856_1213643.sample'
imput_geno = Genodata(imp_geno_path,bgen_sample_path)
# Restrict to chromosome 5, positions 329316..699066 widened by 10 on each side.
# NOTE(review): these bounds look like the lifted-over positions from the
# cell above — confirm.
imput_geno.extractbyregion([5,329316-10,699066+10])
imput_geno
# Keep only the first five and last five rows of bim, and the matching
# first/last five slices along axis 0 of bed, so both stay the same length.
# NOTE(review): whether assigning .bim has side effects on .bed depends on
# Genodata, which is not visible here — keep this statement order.
imput_geno.bim = pd.concat([imput_geno.bim[:5],imput_geno.bim[-5:]])
imput_geno.bed = da.concatenate([imput_geno.bed[:5,:],imput_geno.bed[-5:,:]])
# Materialise the subset (presumably a dask array — `da` is imported outside
# this view) for the row-wise genotype counting that follows.
gn = imput_geno.bed.compute()
def a1freq(d):
    """Return the A1 allele frequency implied by a table of genotype counts.

    Parameters
    ----------
    d : mapping-like
        Genotype value -> number of samples carrying it, looked up as
        ``d[2]``, ``d[1]`` and ``d[0]`` (in this file: the result of
        ``pandas.Series.value_counts()`` on one row of genotypes).
        Keys other than 0, 1 and 2 are ignored.

    Returns
    -------
    float
        ``(2 * n2 + n1) / (2 * (n2 + n1 + n0))``, or ``nan`` when none of
        the three genotype classes is present (nothing to divide by).

    A missing genotype class counts as zero and is reported on stdout as
    ``'no 2'``, ``'no 1'`` or ``'no 0'``.
    """
    counts = {}
    for genotype in (2, 1, 0):
        try:
            counts[genotype] = d[genotype]
        except (KeyError, IndexError):
            # Only a failed lookup means "class absent"; any other error
            # should surface instead of being swallowed.
            print(f'no {genotype}')
            counts[genotype] = 0

    # A genotype of 2 carries two A1 copies, a genotype of 1 carries one.
    a1 = 2 * counts[2] + counts[1]
    # Every counted sample contributes two alleles to the denominator.
    tol = 2 * (counts[2] + counts[1] + counts[0])
    if tol == 0:
        return float('nan')
    return a1 / tol
# Attach a genotype-derived A1 frequency to every retained imputed variant.
# gn is iterated row by row rather than over a fixed range(10), so the list
# always has one entry per row of gn, however many rows were kept.
bim = imput_geno.bim.copy()
bim['A1freq'] = [a1freq(pd.Series(row).value_counts()) for row in gn]
bim
# Summary-statistics rows inside the same chromosome-5 window that was used
# for the genotype extraction.
region = [5,329316-10,699066+10]
sub_ref_sumstat = ref_sumstats[(ref_sumstats.CHROM == region[0]) & (ref_sumstats.GENPOS >= region[1]) & (ref_sumstats.GENPOS <= region[2])]
sub_ref_sumstat
# Repeat the check on the exome data for a different chromosome-5 window.
region = [5,272755,306748]
geno_path = 'MWE_region_extraction/ukb23156_c5.merged.filtered.5_272741_1213528.bed'
sumstats_path = 'MWE_region_extraction/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl_PC1_2_f3393.regenie.snp_stats'
exome_sumstats = Sumstat(sumstats_path)
exome_geno = Genodata(geno_path)
exome_sumstats.extractbyregion(region)
# NOTE(review): geno_in_stat presumably restricts the genotypes to variants
# present in the extracted summary statistics — confirm against Genodata.
exome_geno.geno_in_stat(exome_sumstats.ss)
exome_geno
# Rows 1 and -1 of bed; the two-element index fixes eg at two rows, so
# iterating its rows replaces the former range(2).
eg = exome_geno.bed[[1,-1],:].compute()
[a1freq(pd.Series(row).value_counts()) for row in eg]
[pd.Series(row).value_counts() for row in eg]
# The same window in the exome summary-statistics table, for comparison
# with the frequencies computed from genotypes above.
exome_sumstats1[(exome_sumstats1.CHROM == region[0]) & (exome_sumstats1.GENPOS >= region[1]) & (exome_sumstats1.GENPOS <= region[2])]