--- title: Liftover module keywords: fastai sidebar: home_sidebar summary: "Liftover genodata and sumstat" description: "Liftover genodata and sumstat" nb_path: "nbs/02_Liftover.ipynb" ---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

class Liftover[source]

Liftover(fr='hg19', to='hg38')

{% endraw %} {% raw %}
{% endraw %} {% raw %}
geno.bim.shape[0]
424
{% endraw %} {% raw %}
from pathlib import Path
from cugg.genodata import *
from cugg.sumstat import Sumstat
from cugg.liftover import Liftover
def main(input_path,output_path,fr='hg19',to='hg38',remove_missing=True,rename=True):
    """Lift a genotype or summary-statistics file from build `fr` to build `to`.

    The input type is chosen from the file name:

    * a final suffix of ``.bim``/``.bed``/``.fam`` is handled as a PLINK file set;
    * any ``.vcf`` or ``.gvcf`` among the suffixes is delegated to
      ``Liftover.vcf_liftover``;
    * everything else is read as a summary-statistics table.

    Parameters
    ----------
    input_path : str or Path
        File to lift over. Must exist.
    output_path : str or Path
        Destination. For PLINK input only its stem is used; the suffix is
        replaced with ``.bed`` (or ``.bim`` when `remove_missing` is false).
    fr, to : str
        Source and target genome builds passed to ``Liftover``.
    remove_missing : bool
        If true, variants that failed to lift over (chromosome set to 0) are
        dropped from the output. If false they are kept; for PLINK input only
        the ``.bim`` file is written in that case.
    rename : bool
        Forwarded to ``Sumstat`` and ``Liftover.sumstat_liftover``.

    Raises
    ------
    FileNotFoundError
        If `input_path` does not exist.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)
    # Fail fast, before the Liftover object is built, instead of printing a
    # warning and carrying on with a path that cannot be read.
    if not input_path.exists():
        raise FileNotFoundError(f"The file does not exist: {input_path}")

    lf = Liftover(fr,to)
    print(f"liftover from {fr} to {to}")
    print("Removing SNPs failed to liftover is", remove_missing)

    #file type detection, sumstats, plink, vcf,gvcf, >>>future bgen
    input_suffixes = set(input_path.suffixes)
    if input_path.suffix in ('.bim','.bed','.fam'):
        geno = Genodata(str(input_path.with_suffix('.bed')))
        new_bim = lf.bim_liftover(geno.bim)
        # Variants that could not be lifted are marked with chromosome 0.
        failed = new_bim.chrom == 0
        n_failed = int(failed.sum())
        if remove_missing:
            geno.bim = new_bim
            geno.extractbyidx(~failed)
            geno.export_plink(output_path.with_suffix('.bed'))
            print("Total number SNPs ",new_bim.shape[0],". Removing SNPs failed to liftover ", n_failed)
        else:
            # Only the variant table changes, so only the .bim file is written.
            write_bim(output_path.with_suffix('.bim'),new_bim)
            print("Total number SNPs ",new_bim.shape[0],". The number of SNPs failed to liftover ", n_failed,". Their chr and pos is replaced with 0, 0")
    elif input_suffixes & {'.gvcf','.vcf'}:
        # Checks every suffix so compressed files such as *.gvcf.gz match too.
        lf.vcf_liftover(input_path,output_path,remove_missing)
    else:
        print("This file is considered as sumstat format file")
        sums = Sumstat(input_path,rename=rename)
        new_sums = lf.sumstat_liftover(sums.ss,rename)
        failed = new_sums.CHR == 0
        n_failed = int(failed.sum())
        csv_opts = dict(compression='gzip', sep="\t", header=True, index=False)
        if remove_missing:
            new_sums[~failed].to_csv(output_path, **csv_opts)
            print("Total number SNPs ",new_sums.shape[0],". Removing SNPs failed to liftover ", n_failed)
        else:
            new_sums.to_csv(output_path, **csv_opts)
            print("Total number SNPs ",new_sums.shape[0],". The number of SNPs failed to liftover ", n_failed,". Their chr and pos is replaced with 0, 0")
{% endraw %} {% raw %}
from glob import glob
{% endraw %} {% raw %}
from pathlib import Path
{% endraw %} {% raw %}
tmp = Path('data/GH.AR.SAD.P1.001.0_X3547_S42_1180478_GVCF.hard-filtered.gvcf.gz')
{% endraw %} {% raw %}
tmp.suffix[1:] in ['bim','bed','fam']
False
{% endraw %} {% raw %}
tmp.with_suffix('.bed')
Path('test.bed')
{% endraw %}

1. Test liftover of VCF and gVCF files

{% raw %}
lf = Liftover('hg19','hg38')
{% endraw %} {% raw %}
lf.chainmap[22][50549067]
{% endraw %} {% raw %}
vcf ='data/GH.AR.SAD.P1.001.0_X3547_S42_1180478_GVCF.hard-filtered.gvcf.gz'
{% endraw %} {% raw %}
main(vcf,'data/new_hg19_hg38_test.gvcf.gz',remove_missing=True)
liftover from hg19tohg38
Removing SNPs failed to liftover is True
Total number SNPs  93816694 . The number of SNPs failed to liftover  59995
{% endraw %} {% raw %}
lf.region_liftover([5,272741,1213528-900000])
(5, 272626, 313413)
{% endraw %} {% raw %}
lf.vcf_liftover(vcf)
{% endraw %} {% raw %}
59995/93816694
0.0006394917305442463
{% endraw %}
{% raw %}
1
{% endraw %} {% raw %}
from cugg.genodata import Genodata
{% endraw %} {% raw %}
geno = Genodata('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/full_sample/bfiles/full_sample.bed')
{% endraw %} {% raw %}
geno = Genodata('/mnt/mfs/statgen/guangyou/imputation/genome/othergenes/UKB_exome_othergenes.bed')
{% endraw %} {% raw %}
main('/mnt/mfs/statgen/guangyou/imputation/genome/othergenes/UKB_exome_othergenes.bed','test2.bed')
{% endraw %} {% raw %}
main('MWE_region_extraction/100521_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k_PC1_PC2_f3393.regenie.snp_stats.gz','test_sumstats.sumstats.gz')
liftover from hg19tohg38
Removing SNPs failed to liftover is True
This file is considered as sumstat format file
{% endraw %} {% raw %}
geno
bim:      chrom                  snp   cm        pos a0 a1     i
0         1    chr1:55039741:G:C  0.0   55039741  G  C     0
1         1    chr1:55039742:G:A  0.0   55039742  G  A     1
2         1    chr1:55039749:G:C  0.0   55039749  G  C     2
3         1    chr1:55039750:T:C  0.0   55039750  T  C     3
4         1    chr1:55039753:T:C  0.0   55039753  T  C     4
...     ...                  ...  ...        ... .. ..   ...
1408     11  chr11:116832956:T:C  0.0  116832956  T  C  1408
1409     11  chr11:116832976:C:G  0.0  116832976  C  G  1409
1410     11  chr11:116832977:T:G  0.0  116832977  T  G  1410
1411     11  chr11:116832978:T:G  0.0  116832978  T  G  1411
1412     11  chr11:116832980:T:C  0.0  116832980  T  C  1412

[1413 rows x 7 columns] 
 fam:            fid      iid father mother gender trait       i
0       1000019  1000019      0      0      2    -9       0
1       1000078  1000078      0      0      2    -9       1
2       1000081  1000081      0      0      1    -9       2
3       1000198  1000198      0      0      2    -9       3
4       1000210  1000210      0      0      1    -9       4
...         ...      ...    ...    ...    ...   ...     ...
168201  6025295  6025295      0      0      1    -9  168201
168202  6025319  6025319      0      0      2    -9  168202
168203  6025346  6025346      0      0      2    -9  168203
168204  6025363  6025363      0      0      1    -9  168204
168205  6025411  6025411      0      0      2    -9  168205

[168206 rows x 7 columns] 
 bed:dask.array<transpose, shape=(1413, 168206), dtype=float32, chunksize=(1024, 1024), chunktype=numpy.ndarray>
{% endraw %} {% raw %}
geno.extractbyidx(~(geno.bim.chrom == 1))
{% endraw %} {% raw %}
geno.export_plink('test1.bed')
Writing BED:   0%|          | 0/1 [00:00<?, ?it/s]
Writing BED: 100%|██████████| 1/1 [00:02<00:00,  2.26s/it]
Writing FAM... 

done.
Writing BIM... done.
{% endraw %}

Test liftover sumstat

{% raw %}
from cugg.sumstat import Sumstat
from cugg.liftover import Liftover
def gwas_liftover(input_path,output_path,output_unmapped,output_mapped,fr='hg19',to='hg38',remove_missing=False):
    """Lift a GWAS summary-statistics file from build `fr` to build `to`.

    Parameters
    ----------
    input_path : str
        Summary-statistics file, read with ``Sumstat``.
    output_path : str
        Destination for the full lifted table; used only when
        `remove_missing` is false.
    output_unmapped, output_mapped : str
        Destinations for the rows with ``CHR == 0`` and ``CHR != 0``
        respectively; used only when `remove_missing` is true.
    fr, to : str
        Source and target genome builds passed to ``Liftover``.
    remove_missing : bool
        If true, write mapped and unmapped rows to separate files instead of
        one combined file.

    All outputs are gzip-compressed, tab-separated, with a header and no index.
    """
    lf = Liftover(fr,to)
    print("reading GWAS sumstat")
    sums = Sumstat(input_path)
    print(f"liftover from {fr} to {to}")
    lifted = lf.sumstat_liftover(sums.ss)
    csv_opts = dict(compression='gzip', sep="\t", header=True, index=False)
    if remove_missing:
        # Rows whose chromosome is 0 are the ones split off as unmapped.
        unmapped = lifted.CHR == 0
        lifted[unmapped].to_csv(output_unmapped, **csv_opts)
        lifted[~unmapped].to_csv(output_mapped, **csv_opts)
    else:
        lifted.to_csv(output_path, **csv_opts)
{% endraw %} {% raw %}
def gwas_liftover(input_file,output_path=None,fr='hg19',to='hg38',remove_missing=False):
    """Lift a GWAS summary-statistics file and write the result next to it.

    Parameters
    ----------
    input_file : str
        Summary-statistics file, read with ``Sumstat``.
    output_path : str, optional
        Directory that receives the output file(s). Defaults to the directory
        containing `input_file` (the current directory for a bare file name).
    fr, to : str
        Source and target genome builds passed to ``Liftover``.
    remove_missing : bool
        If true, rows with ``CHR == 0`` and ``CHR != 0`` are written to
        ``<stem>.<to>.sumstats_unmapped.gz`` and
        ``<stem>.<to>.sumstats_mapped.gz``; otherwise the whole table goes to
        ``<stem>.<to>.sumstats.gz``.

    All outputs are gzip-compressed, tab-separated, with a header and no index.
    """
    # Local import: in the notebook this cell precedes the `import os` cell.
    import os

    if output_path is None:
        # `or '.'` keeps a bare file name from resolving to the filesystem root.
        output_path = os.path.dirname(input_file) or '.'

    # NOTE(review): two extensions are stripped on the assumption that inputs
    # look like `<name>.snp_stats.gz` and that this mirrors the `bnn` specifier
    # used in the output-name templates elsewhere in this notebook - confirm.
    stem = os.path.basename(input_file)
    for _ in range(2):
        stem = os.path.splitext(stem)[0]
    prefix = os.path.join(output_path, f'{stem}.{to}.sumstats')

    lf = Liftover(fr,to)
    print("reading GWAS sumstat")
    sums = Sumstat(input_file)
    print(f"liftover from {fr} to {to}")
    lifted = lf.sumstat_liftover(sums.ss)
    csv_opts = dict(compression='gzip', sep="\t", header=True, index=False)
    if remove_missing:
        unmapped = lifted.CHR == 0
        lifted[unmapped].to_csv(prefix + '_unmapped.gz', **csv_opts)
        lifted[~unmapped].to_csv(prefix + '_mapped.gz', **csv_opts)
    else:
        lifted.to_csv(prefix + '.gz', **csv_opts)
{% endraw %} {% raw %}
# NOTE(review): the trailing commas on the first two assignments make those
# values one-element tuples rather than strings, and `:bnn` is passed to the
# value's __format__ as a format spec, which plain `str` does not accept.
# Presumably pasted from a workflow `output:` statement - confirm before
# running this cell as Python. `cwd` and `_input` are not defined in this view.
sumstats_lifted = f'{cwd}/{_input:bnn}.hg38.sumstats.gz',
sumstats_unmapped = f'{cwd}/{_input:bnn}.hg38.sumstats_unmapped.gz',
sumstats_mapped = f'{cwd}/{_input:bnn}.hg38.sumstats_mapped.gz'
{% endraw %} {% raw %}
import os
{% endraw %} {% raw %}
tmp = os.path.basename(input_path)
{% endraw %} {% raw %}
os.path.splitext(tmp)
('100521_UKBB_Combined_f2247_f2257_expandedwhite_93258cases_237318ctrl_500k_PC1_PC2_f2247_f2257.regenie.snp_stats',
 '.gz')
{% endraw %} {% raw %}
os.path.dirname(input_path)+'/'
'/home/dmc2245/UKBiobank/results/REGENIE_results/results_imputed_data/2021_10_07_combined_500K/'
{% endraw %} {% raw %}
sums = Sumstat(input_path)
{% endraw %} {% raw %}
lf = Liftover('hg19','hg38')
{% endraw %} {% raw %}
lf.sumstat_liftover(sums.ss[:10])
CHR POS REF ALT SNP BETA SE P
0 1 13259 G A chr1:13259:G:A 0.434586 0.175780 0.014801
1 1 17569 C A chr1:17569:C:A -0.030568 0.795968 0.969366
2 1 17641 G A chr1:17641:G:A -0.078881 0.108663 0.467883
3 1 30741 C A chr1:30741:C:A -1.599610 0.990472 0.044798
4 1 57222 T C chr1:57222:T:C 0.031666 0.121422 0.794253
5 1 58396 T C chr1:58396:T:C 0.366266 0.172004 0.035663
6 1 62157 G A chr1:62157:G:A -0.147251 0.296105 0.618983
7 1 62595 C T chr1:62595:C:T 0.356993 0.171623 0.040096
8 1 69487 G A chr1:69487:G:A -0.559373 0.853882 0.512407
9 1 69569 T C chr1:69569:T:C 0.232690 0.216585 0.282662
{% endraw %} {% raw %}
def main(input_path,output_path,remove_missing):
    """Lift a regenie summary-statistics file and write it as a TSV.

    Parameters
    ----------
    input_path : str
        File read with ``read_regenie``.
    output_path : str
        Destination of the tab-separated output (header, no index).
    remove_missing : bool
        If true, rows with ``CHR == 0`` are dropped before writing;
        otherwise every row is written.
    """
    sums = read_regenie(input_path)
    lifted = sumstat_liftover(sums)
    if remove_missing:
        # Keep the rows whose chromosome is non-zero; the earlier
        # `CHR == 0` filter wrote only the rows meant to be removed.
        lifted = lifted[lifted.CHR != 0]
    lifted.to_csv(output_path, sep="\t", header=True, index=False)
{% endraw %} {% raw %}
input_path = '/home/dmc2245/UKBiobank/results/REGENIE_results/results_imputed_data/2021_10_07_combined_500K/100521_UKBB_Combined_f2247_f2257_expandedwhite_93258cases_237318ctrl_500k_PC1_PC2_f2247_f2257.regenie.snp_stats.gz'
output_path = ''
remove_missing = True
{% endraw %} {% raw %}
main(input_path,output_path,remove_missing)
{% endraw %} {% raw %}
':'.join([1,'1'])
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/tmp/1967998.1.high_mem.q/ipykernel_39087/731612175.py in <module>
----> 1 ':'.join([1,'1'])

TypeError: sequence item 0: expected str instance, int found
{% endraw %} {% raw %}
gen
{% endraw %}

test chr22

{% raw %}
region = [22,50519304,50549676]
exome_sumstats = Sumstat('/home/dmc2245/UKBiobank/results/REGENIE_results/results_imputed_data/010522_f2247_hearing_diff_200K_imputed/090321_UKBB_Hearing_difficulty_f2247_expandedwhite_45502cases_96601ctrl_PC1_2_f2247.regenie.snp_stats.gz')
exome_sumstats.extractbyregion(region)
this region [22, 50519304, 50549676] has 669 SNPs in Sumstat
{% endraw %} {% raw %}
# Step 1.2: convert the region coordinates with an hg38 -> hg19 Liftover and
# extract that region from the second sumstat object.
print('1.2. LiftOver the region')
hg38toimpref = Liftover('hg38','hg19')
imp_region = hg38toimpref.region_liftover(region)
# NOTE(review): `imput_sumstats` is not defined anywhere in this view - confirm the name.
imput_sumstats.extractbyregion(imp_region)

# Step 1.3: build the Liftover for the reverse direction.
# NOTE(review): `imp_ref` is not defined in this view; presumably 'hg19' - confirm.
print('1.3. Regional SNPs Liftover')
impreftohg38 = Liftover(imp_ref,'hg38') # opposite direction of hg38toimpref
impreftohg38.
{% endraw %}