--- title: Title keywords: fastai sidebar: home_sidebar nb_path: "nbs/Untitled.ipynb" ---
import os
import yaml
import glob
import pandas as pd
from Bio.Seq import Seq
# NOTE(review): this file is an exported Jupyter notebook (see the jekyll
# front-matter header above). The statements below are scratch cells that
# call functions defined further down in the file, so they only work in
# notebook cell-execution order, not as a straight top-to-bottom script run.
# Preview the list of yml config files referenced by the pipeline.
pd.read_csv('../xqtl-pipeline/pipeline/misc/data/yml_list.txt',sep = "\t").values.tolist()
# Run the full merge on the template config, dropping strand-ambiguous SNPs.
merge_sumstats('../xqtl-pipeline/pipeline/misc/data/template.yml',keep_ambiguous=False)
# Manually load and inspect the parsed pieces of the template config.
yml = load_yaml('../xqtl-pipeline/pipeline/misc/data/template.yml')
input_dict = parse_input(yml['INPUT'])
target_dict = parse_input(yml['TARGET'])
output_path = yml['OUTPUT']
yml['TARGET']
input_dict
target_dict
list(target_dict.values())
def merge_sumstats(yml,keep_ambiguous):
    '''Merge several GWAS summary-statistics files down to their common SNPs.

    Parameters
    ----------
    yml : str
        Path to a YAML config with keys INPUT (list of {glob-pattern:
        column-map} entries), TARGET (same shape, the reference sumstats)
        and OUTPUT (directory for the merged files).
    keep_ambiguous : bool
        Passed through to snps_match: if True, strand-ambiguous SNPs
        (A/T, C/G) are kept; otherwise they are dropped.

    Side effects: writes one gzip-compressed tab-separated file per input
    sumstats (same basename as the input) into the OUTPUT directory.
    '''
    # parse yaml
    yml = load_yaml(yml)
    input_dict = parse_input(yml['INPUT'])
    target_dict = parse_input(yml['TARGET'])
    output_path = yml['OUTPUT']
    # the target file participates in the merge like any other input
    input_dict[list(target_dict.keys())[0]] = list(target_dict.values())[0]
    lst_sumstats_file = [os.path.basename(i) for i in input_dict.keys()]
    print('Total number of sumstats: ',len(lst_sumstats_file))
    if len(set(lst_sumstats_file))<len(lst_sumstats_file):
        # fix: format the file list into the message instead of passing a
        # (message, list) tuple to Exception
        raise Exception("There are duplicated names in %s" % lst_sumstats_file)
    # read all sumstats
    print(input_dict)
    lst_sumstats = {os.path.basename(i):read_sumstat(i,j) for i,j in input_dict.items()}
    # align every sumstats (including the target itself) against the target
    nqs = []
    for query in lst_sumstats.values():
        nq,_ = snps_match(query,lst_sumstats[os.path.basename(list(target_dict.keys())[0])],keep_ambiguous)
        nqs.append(nq)
    # get common snps
    common_snps = set.intersection(*[set(nq.SNP) for nq in nqs])
    print('Total number of common SNPs: ',len(common_snps))
    # write out new sumstats
    os.makedirs(output_path, exist_ok=True)  # robustness: ensure output dir exists
    for output_sumstats,nq in zip(lst_sumstats_file,nqs):
        sumstats = nq[nq.SNP.isin(common_snps)]
        sumstats.to_csv(os.path.join(output_path, output_sumstats), sep = "\t", header = True, index = False,compression='gzip')
    print('All are done!!!')
def load_yaml(yaml_file):
    '''Load a YAML file and return its parsed content.

    Parameters
    ----------
    yaml_file : str
        Path to the YAML file to read.

    Raises
    ------
    yaml.YAMLError
        If the file is not valid YAML. The error is printed and then
        re-raised: the original code printed it and fell through to
        `return yml` with `yml` unbound, which raised a confusing
        NameError instead.
    '''
    with open(yaml_file, "r") as stream:
        try:
            yml = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
            raise
    return yml
def parse_input(yml_input):
    '''Expand config entries into a {matched_file: column-map} dict.

    Each element of *yml_input* is a one-entry mapping whose key is a glob
    pattern and whose value is a column map; every file matching the
    pattern gets its own shallow copy of that map.
    '''
    expanded = {}
    for entry in yml_input:
        pattern = next(iter(entry.keys()))
        columns = next(iter(entry.values()))
        for matched_path in glob.glob(pattern):
            # copy so later per-file mutation cannot leak across files
            expanded[matched_path] = columns.copy()
    return expanded
# Scratch cells: quick interactive sanity checks of dict.pop and str.split
# semantics; unrelated to the pipeline functions in this file.
a_dict = {"a": 1, "B": 2, "C": 3}
a_dict.pop('a')
a_dict
'a,b'.split(',')
def read_sumstat(file, config=None):
    '''Read one summary-statistics table and standardize its columns.

    Parameters
    ----------
    file : str
        Path to a tab-separated sumstats file, plain text or gzip.
    config : dict or None
        Column map. The special key 'ID' holds a comma-separated list of
        input columns joined with ':' to build the row index; every other
        entry maps output-column-name -> input-column-name. The output
        names are expected to include CHR, POS, A0, A1 and SNP (used
        below). If None, the file is returned as read.

    Returns
    -------
    pandas.DataFrame indexed by the joined ID columns, with renamed
    columns, a rebuilt 'chr<CHR>:<POS>:<A0>:<A1>' SNP column and integer
    CHR/POS.
    '''
    print(file,config)
    # fix: let pandas infer gzip from the filename instead of probing with
    # a hard-coded compression setting inside a bare try/except
    sumstats = pd.read_csv(file, compression='infer', header=0, sep='\t', quotechar='"')
    if config is not None:
        # fix: work on a copy so pop() does not mutate the caller's dict
        config = dict(config)
        id_cols = config.pop('ID').split(',')
        try:
            sumstats.index = sumstats.loc[:,id_cols].astype(str).agg(':'.join, axis=1)
            sumstats = sumstats.loc[:,list(config.values())]
        except KeyError:
            # fix: only a missing column should map to this error (was bare except)
            raise ValueError('According to config_file, input summary statistics should have the following columns: %s' % list(config.values()))
        sumstats.columns = list(config.keys())
        # rebuild a canonical SNP id from the standardized columns
        sumstats.SNP = 'chr'+sumstats.CHR.astype(str) + ':' + sumstats.POS.astype(str) + ':' + sumstats.A0.astype(str) + ':' + sumstats.A1.astype(str)
        sumstats.CHR = sumstats.CHR.astype(int)
        sumstats.POS = sumstats.POS.astype(int)
    return sumstats
['a','b'].remove('a')
def snps_match(query,subject,keep_ambiguous=True):
    '''Align query sumstats rows to subject (reference) rows by chr:pos and alleles.

    Parameters
    ----------
    query, subject : pandas.DataFrame
        Sumstats tables whose first two columns are chromosome and position
        and which carry A0/A1 allele columns plus a STAT column; the first
        five columns are treated as SNP info, the rest as statistics.
        NOTE(review): both inputs have their index overwritten in place.
    keep_ambiguous : bool
        If True, keep strand-ambiguous (A/T, C/G) SNPs; otherwise drop them.

    Returns
    -------
    (new_query, new_subject): the matched rows, with new_query taking SNP
    info from the subject and statistics (sign-corrected) from the query.
    '''
    # index both tables by "chr:pos" (their first two columns)
    query.index = query.iloc[:,:2].astype(str).agg(':'.join, axis=1)
    subject.index = subject.iloc[:,:2].astype(str).agg(':'.join, axis=1)
    #overlap snps by chr+pos
    print("Total rows of query: ",query.shape[0],"Total rows of subject: ",subject.shape[0])
    subject = subject[subject.index.isin(query.index)]
    query = query.loc[subject.index]
    print("Overlap chr:pos",query.shape[0])
    if query.index.duplicated().any():
        raise Exception("There are duplicated chr:pos")
    pm = pair_match(query.A1,query.A0,subject.A1,subject.A0)
    if keep_ambiguous:
        # fix: count the ambiguous rows (pm.ambiguous); the original printed
        # sum(~pm.ambiguous), i.e. the NON-ambiguous count, under this message
        print('Warning: there are',sum(pm.ambiguous),'ambiguous SNPs')
        pm = pm.iloc[:,1:]
    else:
        pm = pm[~pm.ambiguous].iloc[:,1:]
    print(pm)
    keep_idx = pm.any(axis=1)
    keep_idx = keep_idx.index[keep_idx==True]
    print("Overlap SNPs",len(keep_idx))
    #overlap snps by chr+pos+alleles.
    new_subject = subject.loc[keep_idx]
    # update beta and snp info: SNP info (first 5 cols) comes from the
    # subject, statistics (remaining cols) from the query
    new_query = pd.concat([new_subject.iloc[:,:5],query.loc[keep_idx].iloc[:,5:]],axis=1)
    # fix: restrict the flip mask to the kept rows and assign via .loc to
    # avoid pandas chained-assignment (SettingWithCopy) issues
    flip = pm.sign_flip.reindex(keep_idx, fill_value=False)
    new_query.loc[flip, 'STAT'] = -new_query.loc[flip, 'STAT']
    return new_query,new_subject
def pair_match(a1,a2,ref1,ref2):
    '''Classify allele pairs of one dataset against a second dataset.

    a1/a2 are the alleles of the first dataset, ref1/ref2 of the second
    (all pandas string Series). Returns a DataFrame of boolean columns:
    ambiguous, sign_flip, strand_flip, exact_match.
    '''
    # normalize case so all comparisons are on A/T/C/G
    a1, a2 = a1.str.upper(), a2.str.upper()
    ref1, ref2 = ref1.str.upper(), ref2.str.upper()
    # strand-flipped representation of the 2nd dataset's alleles
    flip1, flip2 = ref1.apply(strand_flip), ref2.apply(strand_flip)
    return pd.DataFrame({
        # A/T and C/G pairs cannot be disambiguated across strands
        "ambiguous": (((a1=="A") & (a2=="T")) | ((a1=="T") & (a2=="A"))
                      | ((a1=="C") & (a2=="G")) | ((a1=="G") & (a2=="C"))),
        # allele roles swapped (with or without a strand flip) => sign flips
        "sign_flip": ((a1==ref2) & (a2==ref1)) | ((a1==flip2) & (a2==flip1)),
        # same roles but on the opposite strand
        "strand_flip": ((a1==flip1) & (a2==flip2)) | ((a1==flip2) & (a2==flip1)),
        # identical pair; anything matching no column (e.g. tri-allelic) is dropped upstream
        "exact_match": (a1 == ref1) & (a2 == ref2),
    })
# IUPAC DNA complement table (same pairs Bio.Seq uses: A<->T, C<->G,
# M<->K, R<->Y, W, S, V<->B, H<->D, N; both cases).
_DNA_COMPLEMENT = str.maketrans('ACGTMRWSYKVHDBNacgtmrwsykvhdbn',
                                'TGCAKYWSRMBDHVNtgcakywsrmbdhvn')

def strand_flip(s):
    '''Return the reverse complement of allele string *s*.

    Replaces the Bio.Seq round-trip with a stdlib str.translate, removing
    the Biopython dependency while producing the same result for DNA
    alleles (including IUPAC ambiguity codes).
    '''
    return s.translate(_DNA_COMPLEMENT)[::-1]