---
title: Title
keywords: fastai
sidebar: home_sidebar
nb_path: "nbs/06_SNPmatch.ipynb"
---
pip install biopython
For the region extraction case, I think we only need to consider scenario 1, which makes the genotype coding consistent across different software by flipping the sign of beta in the sumstats. For merging exome and imputed data, it is not necessary to consider how SNPs overlap across the different datasets: if two SNPs are the same, both of them will be shown in a credible set.
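As a minimal sketch of scenario 1 (illustrative only; the made-up values and variable names below are not from the pipeline, and the real handling lives in `snps_match` further down): when the same SNP is reported with REF/ALT swapped between two datasets, harmonizing the genotype coding amounts to negating BETA in one of them.
import pandas as pd
# Toy example: one SNP reported as REF=A/ALT=G in dataset 1 and REF=G/ALT=A in dataset 2.
ds1 = pd.DataFrame({'CHR': [5], 'POS': [272801], 'REF': ['A'], 'ALT': ['G'], 'BETA': [0.12]})
ds2 = pd.DataFrame({'CHR': [5], 'POS': [272801], 'REF': ['G'], 'ALT': ['A'], 'BETA': [-0.12]})
# Detect swapped REF/ALT and flip the sign of beta so both datasets use the same effect allele.
swapped = (ds1.REF.values == ds2.ALT.values) & (ds1.ALT.values == ds2.REF.values)
ds2.loc[swapped, 'BETA'] = -ds2.loc[swapped, 'BETA']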
region = [5,272741,1213528]
geno_path = '../MWE_region_extraction/ukb23156_c5.merged.filtered.5_272741_1213528.bed'
sumstats_path = '../MWE_region_extraction/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl_PC1_2_f3393.regenie.snp_stats'
pheno_path = None
unr_path = 'MWE_region_extraction/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.white_europeans.filtered.092821_ldprun_unrelated.filtered.prune.txt'
imp_geno_path = '../MWE_region_extraction/ukb_imp_chr5_v3_05_272856_1213643.bgen'
imp_sumstats_path = '../MWE_region_extraction/100521_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k_PC1_PC2_f3393.regenie.snp_stats.gz'
imp_ref = 'hg19'
bgen_sample_path = '../MWE_region_extraction/ukb_imp_chr5_v3_05_272856_1213643.sample'
output_sumstats = 'test.snp_stats.gz'
output_LD = 'test_corr.csv.gz'
#main(region,geno_path,sumstats_path,pheno_path,unr_path,imp_geno_path,imp_sumstats_path,imp_ref,output_sumstats,output_LD)
imp_geno_path = '/mnt/mfs/statgen/archive/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr5_v3.bgen'
exome_sumstats = Sumstat(sumstats_path)
exome_sumstats.extractbyregion(region)
ss.index = range(len(ss))
ss.loc[ss.index[:10]]
tmp = namebyordA0_A1(exome_sumstats.ss[['CHR','POS','REF','ALT']])
tmp = pd.Series(tmp).str.split('_')
tmp
ss = exome_sumstats.ss
ss.index = pd.Series(tmp)
ss.index
tmp = ss.index.to_series().apply(lambda x: x.split(':')[0])
tmp.unique()
len(ss.index[0].split('_')[0].split(':'))
'ab_c'.split('_')[0]
ss.index.isin(ss.index)
ss.index[ss.index.duplicated(keep=False)]
list(tmp)
tmp.apply(lambda x: x[0])
exome_geno = Genodata(geno_path)
bim = exome_geno.bim
check_ss1(exome_sumstats.ss,bim)
imput_sumstats = Sumstat(imp_sumstats_path)
imput_geno = Genodata(imp_geno_path,bgen_sample_path)
check_ss1(imput_sumstats.ss,imput_geno.bim)
ss
bim
imput_sumstats
region = [5, 73776529, 73849020]
hg38toimpref = Liftover('hg38','hg19')
imp_region = hg38toimpref.region_liftover(region)
imput_sumstats.extractbyregion(imp_region)
imput_geno.extractbyregion(imp_region)
imput_sumstats.match_ss(imput_geno.bim)
imput_geno.geno_in_stat(imput_sumstats.ss)
a=exome_sumstats.sample(n=1000)
a = exome_sumstats.ss
a = a.sort_index()
aa = a.copy()
aa.REF = list(a.ALT)
aa.ALT = list(a.REF)
tmp = compare_snps(imput_sumstats.ss,exome_sumstats.ss)
tmp
imput_sumstats.ss.loc[tmp.qidx[tmp.exact==False].drop_duplicates()]
imput_sumstats.ss.loc[tmp.qidx[tmp.exact==False]]
print(tmp.iloc[:,:6].value_counts())
sum(tmp['query']==-1)
tmp[tmp.reverse]
tmp[tmp.both]
tmp.flip.value_counts()
tmp[tmp.flip==False]
a[10:]
tmp[tmp.exact==True]
sum(tmp['query'] == tmp['subject'])
tmp.query == tmp.subject
a.loc[[811401,811415],:]
aa.loc[[811367,811400],:]
tmp.exact.value_counts()
a.POS.value_counts()
a
smry = []
query = a[:10].itertuples()
subject = a[100:210].itertuples()
qi, si = next(query, None), next(subject, None)
multi_snps = []
# Walk the two position-sorted iterators in parallel (merge-style);
# multi_snps collects subject rows that share the same position.
while qi and si:
    if qi[1] > si[1]:
        si = next(subject, None)
        multi_snps = []
        continue
    elif qi[1] < si[1]:
        qi = next(query, None)
        if len(multi_snps) == 0:
            smry.append([False]*5 + [-1, -1])
        else:
            for s in multi_snps:
                smry.append(snp_match(qi[3], qi[4], s[3], s[4]) + [qi[0], s[0]])
        continue
    else:
        if qi[2] > si[2]:
            si = next(subject, None)
            multi_snps = []
            continue
        elif qi[2] < si[2]:
            qi = next(query, None)
            if len(multi_snps) == 0:
                smry.append(np.array([False]*5))
            else:
                for s in multi_snps:
                    smry.append(snp_match(qi[3], qi[4], s[3], s[4]) + [qi[0], s[0]])
            continue
        else:
            # the same position has multiple SNPs:
            # compare the query with each of them in subject
            multi_snps.append(si)
            smry.append(snp_match(qi[3], qi[4], si[3], si[4]) + [qi[0], si[0]])
            si = next(subject, None)
smry = pd.DataFrame(smry)
smry
ai = a[:5].itertuples()
for i in ai:
print(i)
tmp = next(ai,None)
print('tmp',tmp)
def snps_match(query, subject, keep_ambiguous=True):
    query.index = query.iloc[:,:2].astype(str).agg(':'.join, axis=1)
    subject.index = subject.iloc[:,:2].astype(str).agg(':'.join, axis=1)
    # overlap SNPs by chr+pos
    print("Total rows of query:", query.shape[0], "Total rows of subject:", subject.shape[0])
    subject = subject[subject.index.isin(query.index)]
    query = query.loc[subject.index]
    print("Overlap chr:pos", query.shape[0])
    if query.index.duplicated().any():
        raise Exception("There are duplicated chr:pos")
    pm = pair_match(query.ALT, query.REF, subject.ALT, subject.REF)
    if keep_ambiguous:
        print('Warning: there are', sum(pm.ambiguous), 'ambiguous SNPs')
        pm = pm.iloc[:,1:]
    else:
        pm = pm[~pm.ambiguous].iloc[:,1:]
        # keep subject/query aligned with pm after dropping ambiguous SNPs
        subject = subject.loc[pm.index]
        query = query.loc[pm.index]
    keep_idx = pm.any(axis=1)
    print("Overlap SNPs", sum(keep_idx))
    # overlap SNPs by chr+pos+alleles
    new_subject = subject[keep_idx]
    # update beta and SNP info
    new_query = pd.concat([new_subject.iloc[:,:5], query[keep_idx].iloc[:,5:]], axis=1)
    new_query.BETA[pm.sign_flip] = -new_query.BETA[pm.sign_flip]
    return new_query, new_subject
def pair_match(a1, a2, ref1, ref2):
    # a1 and a2 are the alleles of the first dataset
    # ref1 and ref2 are the alleles of the second dataset
    # make all alleles upper-case (A, T, C, G)
    a1 = a1.str.upper()
    a2 = a2.str.upper()
    ref1 = ref1.str.upper()
    ref2 = ref2.str.upper()
    # strand flip, to change the allele representation in the second dataset
    flip1 = ref1.apply(strand_flip)
    flip2 = ref2.apply(strand_flip)
    result = {}
    result["ambiguous"] = ((a1=="A") & (a2=="T")) | ((a1=="T") & (a2=="A")) | ((a1=="C") & (a2=="G")) | ((a1=="G") & (a2=="C"))
    # as long as scenario 1 is involved, sign_flip will be True
    result["sign_flip"] = ((a1==ref2) & (a2==ref1)) | ((a1==flip2) & (a2==flip1))
    # as long as scenario 2 is involved, strand_flip will be True
    result["strand_flip"] = ((a1==flip1) & (a2==flip2)) | ((a1==flip2) & (a2==flip1))
    # other cases (e.g. tri-allelic: one dataset is A/C, the other A/G) match no category and are removed later
    result["exact_match"] = ((a1 == ref1) & (a2 == ref2))
    return pd.DataFrame(result)
from Bio.Seq import Seq

def strand_flip(s):
    # reverse complement of the allele string (requires biopython, installed above)
    return ''.join(Seq(s).reverse_complement())
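As a quick sanity check on `pair_match` (an assumed example with hand-made allele pairs, not part of the original pipeline; it assumes pandas is imported as `pd` as elsewhere in this notebook), the expected flags follow directly from the definitions above.
# row 0: identical alleles            -> exact_match
# row 1: REF/ALT swapped              -> sign_flip
# row 2: opposite strand (A/G vs T/C) -> strand_flip
# row 3: A/T pair                     -> ambiguous (it also satisfies the exact/sign/strand conditions, which is why such pairs cannot be resolved)
a1   = pd.Series(['A','A','A','A'])
a2   = pd.Series(['G','G','G','T'])
ref1 = pd.Series(['A','G','T','A'])
ref2 = pd.Series(['G','A','C','T'])
print(pair_match(a1, a2, ref1, ref2))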
exome_sumstats = Sumstat(sumstats_path)
#exome_sumstats.extractbyregion(region)
a=exome_sumstats.ss.sample(n=1000)
a = a.sort_index()
a
ss1 = a[20:520].copy()
ss1 = ss1[~ss1.POS.duplicated()]
def reverse_refalt(ss):
    # swap REF/ALT and flip the sign of BETA (simulates scenario 1)
    ss = ss.copy()
    ref = ss.REF.copy()
    ss.REF = ss.ALT
    ss.ALT = ref
    ss.BETA = -ss.BETA
    return ss
def flip_snps(ss):
    # flip REF and ALT to the opposite strand (simulates scenario 2)
    ss = ss.copy()
    ss.REF = [strand_flip(i) for i in ss.REF]
    ss.ALT = [strand_flip(i) for i in ss.ALT]
    return ss
snps_match(flip_snps(ss1),a,keep_ambiguous=True)
snps_match(reverse_refalt(ss1),a)
snps_match(flip_snps(reverse_refalt(ss1)),a)
a.to_csv('data/testflip/snps1000.regenie.snp_stats.gz', sep = "\t", header = True, index = False,compression='gzip')
ss1.columns = ['CHR','POS','REF','ALT','SNP','BETA','SE','P']
ss1.to_csv('data/testflip/snps500.regenie.snp_stats.gz', sep = "\t", header = True, index = False,compression='gzip')
fss1 = flip_snps(ss1)
fss1.columns = ['CHR','POS','A0','A1','SNP','STAT','SE','P']
fss1.to_csv('data/testflip/flip/snps500_flip.regenie.snp_stats.gz', sep = "\t", header = True, index = False,compression='gzip')
reverse_refalt(ss1).to_csv('data/testflip/snps500_rea0a1.regenie.snp_stats.gz', sep = "\t", header = True, index = False,compression='gzip')
flip_snps(reverse_refalt(ss1)).to_csv('data/testflip/snps500_flip_rea0a1.regenie.snp_stats.gz', sep = "\t", header = True, index = False,compression='gzip')