--- title: Genodata module keywords: fastai sidebar: home_sidebar summary: "read and extract genodata" description: "read and extract genodata" nb_path: "nbs/00_Genodata.ipynb" ---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

read_bgen[source]

read_bgen(file, sample_file=None, pybgen=True)

the function to read genotype data

{% endraw %} {% raw %}
{% endraw %} {% raw %}

read_bim[source]

read_bim(fn)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

bgen2dask[source]

bgen2dask(bgen, index, step=500)

The function to convert bgen to dask array

{% endraw %} {% raw %}
{% endraw %} {% raw %}

pybgen_region[source]

pybgen_region(bgen, region, step=100)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

extract_bed[source]

extract_bed(geno, idx, row=True, step=500, region=None)

Type Default Details
geno No Content
idx No Content
row bool True row = True by variants, row = False by samples
step int 500 No Content
region NoneType None No Content
{% endraw %} {% raw %}
{% endraw %} {% raw %}

class Genodata[source]

Genodata(geno_path, sample_path=None)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

write_plink(G, bed:Union[str, Path], bim:Union[str, Path, NoneType]=None, fam:Union[str, Path, NoneType]=None, row:str='variant', verbose:bool=True)

Write a data array into PLINK 1 binary files.

A PLINK 1 binary file set consists of three files:

  • BED: containing the genotype.
  • BIM: containing variant information.
  • FAM: containing sample information.

The user must provide the genotype (dosage) via a :class:xarray.DataArray matrix with data type :const:numpy.float32 or :const:numpy.float64. That matrix must have two named dimensions: sample and variant. The only allowed values for the genotype are: :const:0, :const:1, :const:2, and :data:math.nan.

Parameters

G Genotype with bim, bed, and fam. bed Path to a BED file. bim Path to a BIM file. It defaults to :const:None, in which case it will try to be inferred. fam Path to a FAM file. It defaults to :const:None, in which case it will try to be inferred. row It can be either :const:"sample" or :const:"variant" (recommended and default). Specify the matrix layout on the BED file. verbose :const:True for progress information; :const:False otherwise.

{% endraw %} {% raw %}

write_fam[source]

write_fam(filepath:Path, df)

{% endraw %} {% raw %}

write_bim[source]

write_bim(filepath:Path, df)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

write_bed[source]

write_bed(filepath:Path, G, row='variant', verbose=True)

Write BED file. It assumes that G is a variant-by-sample matrix.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
write_plink(geno,'test.bed')
Writing BED:   0%|          | 0/1 [00:00<?, ?it/s]
Writing BED: 100%|██████████| 1/1 [00:04<00:00,  4.77s/it]
Writing FAM... 

done.
Writing BIM... done.
{% endraw %} {% raw %}
from pandas_plink import read_plink1_bin
{% endraw %} {% raw %}
geno1 = Genodata('test.bed')
{% endraw %} {% raw %}
geno1
bim:      chrom                  snp   cm        pos a0 a1     i
0         1    chr1:55039741:G:C  0.0   55039741  G  C     0
1         1    chr1:55039742:G:A  0.0   55039742  G  A     1
2         1    chr1:55039749:G:C  0.0   55039749  G  C     2
3         1    chr1:55039750:T:C  0.0   55039750  T  C     3
4         1    chr1:55039753:T:C  0.0   55039753  T  C     4
...     ...                  ...  ...        ... .. ..   ...
1408     11  chr11:116832956:T:C  0.0  116832956  T  C  1408
1409     11  chr11:116832976:C:G  0.0  116832976  C  G  1409
1410     11  chr11:116832977:T:G  0.0  116832977  T  G  1410
1411     11  chr11:116832978:T:G  0.0  116832978  T  G  1411
1412     11  chr11:116832980:T:C  0.0  116832980  T  C  1412

[1413 rows x 7 columns] 
 fam:            fid      iid father mother gender trait       i
0       1000019  1000019      0      0      2    -9       0
1       1000078  1000078      0      0      2    -9       1
2       1000081  1000081      0      0      1    -9       2
3       1000198  1000198      0      0      2    -9       3
4       1000210  1000210      0      0      1    -9       4
...         ...      ...    ...    ...    ...   ...     ...
168201  6025295  6025295      0      0      1    -9  168201
168202  6025319  6025319      0      0      2    -9  168202
168203  6025346  6025346      0      0      2    -9  168203
168204  6025363  6025363      0      0      1    -9  168204
168205  6025411  6025411      0      0      2    -9  168205

[168206 rows x 7 columns] 
 bed:dask.array<transpose, shape=(1413, 168206), dtype=float32, chunksize=(1024, 1024), chunktype=numpy.ndarray>
{% endraw %}

Test

{% raw %}
geno_path ='/home/dmc2245/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c1.merged.filtered.bed'
{% endraw %} {% raw %}
region = [5,272741,1213528-900000]
geno_path = 'MWE_region_extraction/ukb23156_c5.merged.filtered.5_272741_1213528.bed'
sumstats_path = 'MWE_region_extraction/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl_PC1_2_f3393.regenie.snp_stats'
pheno_path = None
unr_path = 'MWE_region_extraction/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.white_europeans.filtered.092821_ldprun_unrelated.filtered.prune.txt'
imp_geno_path = 'MWE_region_extraction/ukb_imp_chr5_v3_05_272856_1213643.bgen'
imp_sumstats_path = 'MWE_region_extraction/100521_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k_PC1_PC2_f3393.regenie.snp_stats'
imp_ref = 'hg19'

output_sumstats = 'test.snp_stats'
output_LD = 'test_corr.csv'

#main(region,geno_path,sumstats_path,pheno_path,unr_path,imp_geno_path,imp_sumstats_path,imp_ref,output_sumstats,output_LD)
{% endraw %} {% raw %}
from pandas_plink import Chunk
{% endraw %} {% raw %}
Chunk(512,512)
Chunk(nsamples=512, nvariants=512)
{% endraw %} {% raw %}
exome_geno.extractbyvariants(exome_geno.bim.snp[:50])
{% endraw %} {% raw %}
exome_geno.extractbysamples(exome_geno.fam.iid[:60])
{% endraw %} {% raw %}
from cugg.sumstat import *
{% endraw %} {% raw %}
region = [5, 272741, 1213528]
{% endraw %} {% raw %}
imput_sumstats = Sumstat('/home/dmc2245/UKBiobank/results/REGENIE_results/results_imputed_data/2021_10_07_f3393_500K/100521_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k_PC1_PC2_f3393.regenie.snp_stats.gz')
{% endraw %} {% raw %}
imput_sumstats.extractbyregion(region)
{% endraw %} {% raw %}
imput_sumstats
sumstat:         CHR      POS REF ALT                SNP      BETA        SE         P
6767726    5   272851   A   G    chr5:272851:A:G  0.357496  0.888197  0.687318
6767727    5   272906   A   C    chr5:272906:A:C -0.003007  0.019764  0.879070
6767728    5   273143   A   G    chr5:273143:A:G -0.013693  0.016716  0.412684
6767729    5   273160   G   C    chr5:273160:G:C  0.235713  0.348772  0.499145
6767730    5   273534   C   T    chr5:273534:C:T  0.050095  0.139496  0.719509
...      ...      ...  ..  ..                ...       ...       ...       ...
6776191    5  1213094   C   T   chr5:1213094:C:T -0.015881  0.023298  0.495462
6776192    5  1213134   G   A   chr5:1213134:G:A -1.142280  1.344380  0.395509
6776193    5  1213223   C   T   chr5:1213223:C:T -0.003009  0.013631  0.825270
6776194    5  1213404   T  TC  chr5:1213404:T:TC -0.039146  0.117837  0.739735
6776195    5  1213510   C   T   chr5:1213510:C:T  0.009318  0.012922  0.470845

[8470 rows x 8 columns]
{% endraw %} {% raw %}
bgen = PyBGEN(geno_file)
sample_file = geno_file.replace('.bgen', '.sample')
if not os.path.isfile(sample_file):
    if not os.path.isfile(${bgen_sample_path:r}):
        raise ValueError(f"Cannot find the matching sample file ``{sample_file}`` for ``{geno_file}``.\nYou can specify path to sample file for all BGEN files using ``--bgen-sample-path``.")
    else:
        sample_file = ${bgen_sample_path:r}
bgen_fam = pd.read_csv(sample_file, header=0, delim_whitespace=True, quotechar='"',skiprows=1)
bgen_fam.columns = ['fid','iid','missing','sex']
geno = [bgen,bgen_fam]
{% endraw %} {% raw %}
bgen_sample_path = '/home/dmc2245/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb32285_imputedindiv.sample'
imput_geno = Genodata(imp_geno_path,bgen_sample_path)
{% endraw %} {% raw %}
imput_geno.extractbyregion(region)
{% endraw %} {% raw %}
imput_geno.extractbyvariants(list(imput_geno.bim.snp[10:20]))
{% endraw %} {% raw %}
imput_geno.extractbysamples(list(imput_geno.fam.iid[50:100]))
{% endraw %} {% raw %}
imput_geno
bim:    chrom              snp   cm     pos a0 a1   i
10      5  chr5:273143:A:G  0.0  273143  A  G  10
11      5  chr5:273160:G:C  0.0  273160  G  C  11
12      5  chr5:273209:A:G  0.0  273209  A  G  12
13      5  chr5:273212:T:C  0.0  273212  T  C  13
14      5  chr5:273237:T:G  0.0  273237  T  G  14
15      5  chr5:273261:C:T  0.0  273261  C  T  15
16      5  chr5:273267:G:A  0.0  273267  G  A  16
17      5  chr5:273310:A:G  0.0  273310  A  G  17
18      5  chr5:273319:A:G  0.0  273319  A  G  18
19      5  chr5:273326:T:C  0.0  273326  T  C  19 
 fam:        fid      iid  missing  sex
50  1426232  1426232        0    1
51  3769979  3769979        0    1
52  1751431  1751431        0    2
53  4658109  4658109        0    2
54  3538677  3538677        0    2
55  3542585  3542585        0    2
56  3249692  3249692        0    2
57  2299845  2299845        0    1
58  4136172  4136172        0    2
59  5406314  5406314        0    1
60  5987848  5987848        0    2
61  3872614  3872614        0    1
62  4818195  4818195        0    2
63  1935571  1935571        0    2
64  3585286  3585286        0    1
65  2989866  2989866        0    2
66  1474609  1474609        0    1
67  1275311  1275311        0    2
68  5471349  5471349        0    2
69  3699731  3699731        0    2
70  2314069  2314069        0    2
71  4343716  4343716        0    2
72  1245364  1245364        0    2
73  4039100  4039100        0    1
74  4652328  4652328        0    1
75  2184919  2184919        0    1
76  3340731  3340731        0    1
77  5722885  5722885        0    1
78  2149616  2149616        0    1
79  3090686  3090686        0    1
80  5925070  5925070        0    2
81  2282972  2282972        0    1
82  4720630  4720630        0    2
83  1902948  1902948        0    2
84  2017232  2017232        0    1
85  3184972  3184972        0    2
86  1135664  1135664        0    1
87  5988158  5988158        0    2
88  5088374  5088374        0    1
89  3148263  3148263        0    2
90  3548968  3548968        0    2
91  5184236  5184236        0    2
92  2963226  2963226        0    2
93  2321078  2321078        0    2
94  2867519  2867519        0    1
95  4419678  4419678        0    2
96  5830444  5830444        0    2
97  2861021  2861021        0    2
98  2375434  2375434        0    1
99  1896578  1896578        0    1 
 bed:dask.array<getitem, shape=(10, 50), dtype=int8, chunksize=(10, 50), chunktype=numpy.ndarray>
{% endraw %} {% raw %}
region
[5, 272741, 313528]
{% endraw %} {% raw %}
from pybgen import PyBGEN
{% endraw %} {% raw %}
bgen = PyBGEN(imp_geno_path,probs_only=True)
{% endraw %} {% raw %}
pybgen_region(bgen,region)
Array Chunk
Bytes 789.75 MiB 46.48 MiB
Shape (1699, 487409) (100, 487409)
Count 34 Tasks 17 Chunks
Type int8 numpy.ndarray
487409 1699
{% endraw %} {% raw %}
for t,g in bgen.iter_variants_in_region('0'+str(region[0]) if region[0]<10 else str(region[0]),region[1],region[2]):
    print(t)
{% endraw %} {% raw %}
import pandas as pd
{% endraw %} {% raw %}
tmp = bgen.iter_variants()
{% endraw %} {% raw %}
genos = []
for i,v in zip(range(bgen.nb_variants),bgen):
    geno = []
    if i % 100000 ==0:
        geno.append(v.argmax(axis=1).astype(np.int8))
        print(i,j)
0 (<Variant rs537688122 chr05:272856_A/G>, array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]]))
{% endraw %} {% raw %}
    genos = []
    n = len(index)
    for i in range(0,n,step):
        onecode_geno = bgen.read(index[i:min(n,i+step)])  #samples x variants
        geno = onecode_geno.argmax(axis=2).astype(np.int8)
        genos.append(da.from_array(geno))
{% endraw %} {% raw %}
1002 %10000
1002
{% endraw %} {% raw %}
tmp
PyBGEN(487,409 samples; 6,070,641 variants)
{% endraw %} {% raw %}
a = tmp.next()
{% endraw %} {% raw %}
import numpy as np
{% endraw %} {% raw %}
a[1]
array([[1.        , 0.        , 0.        ],
       [0.83921569, 0.16078431, 0.        ],
       [0.96078431, 0.03921569, 0.        ],
       ...,
       [1.        , 0.        , 0.        ],
       [0.00392157, 0.95686275, 0.03921569],
       [0.85490196, 0.14509804, 0.        ]])
{% endraw %} {% raw %}
aa = a[1].argmax(axis=1).astype(np.int8)
{% endraw %} {% raw %}
pd.Series(aa).value_counts()
0    455061
1     31952
2       396
dtype: int64
{% endraw %} {% raw %}
tmp = []
for i,t in enumerate(bgen.iter_variant_info()):
    tmp.append([int(t.chrom),t.name,0.0,t.pos,t.a1,t.a2,i])
tmp = pd.DataFrame(tmp,columns=['chrom','snp','cm','pos','a0','a1','i'])
tmp.snp = 'chr'+tmp[['chrom','pos','a0','a1']].astype(str).agg(':'.join, axis=1)
{% endraw %} {% raw %}
tmp
chrom snp cm pos a0 a1 i
0 5 rs546183826 0.0 10043 T A 0
1 5 rs564373080 0.0 10055 T A 1
2 5 rs528775171 0.0 10056 A C 2
3 5 rs547354230 0.0 10058 C A 3
4 5 rs562245928 0.0 10059 C A 4
... ... ... ... ... ... ... ...
6070636 5 rs552848768 0.0 180902794 G A 6070636
6070637 5 rs572660833 0.0 180902887 T A 6070637
6070638 5 rs544891279 0.0 180903491 G T 6070638
6070639 5 rs558980847 0.0 180904360 A T 6070639
6070640 5 rs575676143 0.0 180904689 T C 6070640

6070641 rows × 7 columns

{% endraw %} {% raw %}
list(bgen.iter_variant_info())[0]
<Variant rs546183826 chr05:10043_T/A>
{% endraw %} {% raw %}
idx = imput_geno.idx
{% endraw %} {% raw %}
if type(list(idx)[0]) is bool:
    pd_idx = pd.Series(idx)
    idx = list(pd_idx[pd_idx].index)
{% endraw %} {% raw %}
len(idx)
8470
{% endraw %} {% raw %}
idx[1:10]
[10535, 10541, 10542, 10556, 10567, 10569, 10570, 10573, 10574]
{% endraw %} {% raw %}
imp_geno_path = 'MWE_region_extraction/ukb_imp_chr5_v3_05_272856_1213643.bgen'
bgen = open_bgen(imp_geno_path)
bgen.read(1)
reading -- time=0:00:00.00, thread 1 of 1, part 1 of 1
array([[[1., 0., 0.]],

       [[1., 0., 0.]],

       [[1., 0., 0.]],

       ...,

       [[1., 0., 0.]],

       [[1., 0., 0.]],

       [[1., 0., 0.]]])
{% endraw %} {% raw %}
imp_geno_path = '/mnt/mfs/statgen/archive/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr5_v3.bgen'
bgen = open_bgen(imp_geno_path)
bgen.read(1)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/tmp/1975928.1.all.q/ipykernel_31014/3592512439.py in <module>
      1 imp_geno_path = '/mnt/mfs/statgen/archive/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr5_v3.bgen'
      2 bgen = open_bgen(imp_geno_path)
----> 3 bgen.read(1)

~/miniconda3/lib/python3.8/site-packages/bgen_reader/_bgen2.py in read(self, index, dtype, order, max_combinations, return_probabilities, return_missings, return_ploidies, num_threads)
    529 
    530         max_combinations = (
--> 531             max_combinations if max_combinations is not None else self.max_combinations
    532         )  # Can't use 'or' because it treats 0 as False
    533 

~/miniconda3/lib/python3.8/site-packages/bgen_reader/_bgen2.py in max_combinations(self)
    711 
    712         """
--> 713         return self._metadata2_memmaps["max_combinations"][0]
    714 
    715     @property

~/miniconda3/lib/python3.8/site-packages/bgen_reader/_multimemmap.py in __getitem__(self, name)
    200 
    201     def __getitem__(self, name: str) -> np.memmap:
--> 202         return self._name_to_memmap[name]
    203 
    204     def append_empty(

KeyError: 'max_combinations'
{% endraw %} {% raw %}
imput_geno.bed
<bgen_reader._bgen2.open_bgen at 0x2b1a4e207e80>
{% endraw %} {% raw %}
bgen
<bgen_reader._bgen2.open_bgen at 0x2b1dfde98b20>
{% endraw %} {% raw %}
imput_geno.geno_in_stat(imput_sumstats.ss)
[False, False, False, False, False, False, False, False, False, False]
8470 [10529, 10535, 10541, 10542, 10556, 10567, 10569, 10570, 10573, 10574]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/tmp/1967998.1.high_mem.q/ipykernel_3740/3359055265.py in <module>
----> 1 imput_geno.geno_in_stat(imput_sumstats.ss)

/tmp/1967998.1.high_mem.q/ipykernel_3740/2365980871.py in geno_in_stat(self, stat, notin)
     21         '''The function to find an overlap region between geno data with sumstat'''
     22         variants = stat.SNP
---> 23         self.extractbyvariants(variants,notin)
     24 
     25 

/tmp/1967998.1.high_mem.q/ipykernel_3740/2365980871.py in extractbyvariants(self, variants, notin)
     44             raise ValueError('The extraction is empty')
     45         #update bim,bed
---> 46         self.extractbyidx(idx,row=True)
     47 
     48     def extractbysamples(self,samples,notin=False): #samples is list or pd.Series

/tmp/1967998.1.high_mem.q/ipykernel_3740/2365980871.py in extractbyidx(self, idx, row)
     74             else:
     75                 self.fam = self.fam.iloc[idx]
---> 76         self.bed = extract_bed(self.bed,idx,row)
     77 

/tmp/1967998.1.high_mem.q/ipykernel_3740/2305036667.py in extract_bed(geno, idx, row, step)
     13                 pd_idx = pd.Series(idx)
     14                 idx = list(pd_idx[pd_idx].index)
---> 15             geno = bgen2dask(geno,idx,step)
     16         else:
     17             geno = geno.read() # read all variants

/tmp/1967998.1.high_mem.q/ipykernel_3740/738061970.py in bgen2dask(bgen, index, step)
      6     print(n,index[:10])
      7     for i in range(0,n,step):
----> 8         onecode_geno = bgen.read(index[i:min(n,i+step)])  #samples x variants
      9         geno = onecode_geno.argmax(axis=2).astype(np.int8)
     10         genos.append(da.from_array(geno))

~/miniconda3/lib/python3.8/site-packages/bgen_reader/_bgen2.py in read(self, index, dtype, order, max_combinations, return_probabilities, return_missings, return_ploidies, num_threads)
    529 
    530         max_combinations = (
--> 531             max_combinations if max_combinations is not None else self.max_combinations
    532         )  # Can't use 'or' because it treats 0 as False
    533 

~/miniconda3/lib/python3.8/site-packages/bgen_reader/_bgen2.py in max_combinations(self)
    711 
    712         """
--> 713         return self._metadata2_memmaps["max_combinations"][0]
    714 
    715     @property

~/miniconda3/lib/python3.8/site-packages/bgen_reader/_multimemmap.py in __getitem__(self, name)
    200 
    201     def __getitem__(self, name: str) -> np.memmap:
--> 202         return self._name_to_memmap[name]
    203 
    204     def append_empty(

KeyError: 'max_combinations'
{% endraw %} {% raw %}
read_bgen('/mnt/mfs/statgen/archive/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr5_v3.bgen')
(         chrom                 snp        pos a0 a1
 0            5      chr5:10043:T:A      10043  T  A
 1            5      chr5:10055:T:A      10055  T  A
 2            5      chr5:10056:A:C      10056  A  C
 3            5      chr5:10058:C:A      10058  C  A
 4            5      chr5:10059:C:A      10059  C  A
 ...        ...                 ...        ... .. ..
 6070636      5  chr5:180902794:G:A  180902794  G  A
 6070637      5  chr5:180902887:T:A  180902887  T  A
 6070638      5  chr5:180903491:G:T  180903491  G  T
 6070639      5  chr5:180904360:A:T  180904360  A  T
 6070640      5  chr5:180904689:T:C  180904689  T  C
 
 [6070641 rows x 5 columns],
 None,
 <bgen_reader._bgen2.open_bgen at 0x2b1a4e295c70>)
{% endraw %}