Coverage for sfkit/protocol/register_data.py: 100%
133 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-08-07 15:11 -0400
« prev ^ index » next coverage.py v7.2.7, created at 2023-08-07 15:11 -0400
1import os
2from typing import Optional, Tuple
4import checksumdir
6from sfkit.api import get_doc_ref_dict, update_firestore
7from sfkit.encryption.mpc.encrypt_data import encrypt_data
8from sfkit.utils import constants
9from sfkit.utils.helper_functions import authenticate_user, condition_or_fail
10from sfkit.api import get_username, website_send_file
def register_data(geno_binary_file_prefix: str, data_path: str) -> bool:
    """
    Register data with the server and validate that the data formatting looks correct.

    Validation runs once per participant: if the study status already contains
    "validated", the validation step is skipped entirely. MPC-GWAS encryption is
    attempted on every call regardless.
    """
    authenticate_user()

    doc_ref_dict: dict = get_doc_ref_dict()
    username: str = get_username()
    role: str = str(doc_ref_dict["participants"].index(username))
    study_type: str = doc_ref_dict["study_type"]

    if "validated" in doc_ref_dict["status"][username]:
        print("Data has already been validated; skipping validation step.")
    else:
        # Dispatch to the study-type-specific validator; each may prompt for
        # and return a corrected data path.
        if study_type == "SF-GWAS":
            if constants.BLOCKS_MODE not in doc_ref_dict["description"]:
                geno_binary_file_prefix, data_path = validate_sfgwas(
                    doc_ref_dict, username, data_path, geno_binary_file_prefix
                )
        elif study_type == "MPC-GWAS":
            data_path = validate_mpcgwas(doc_ref_dict, username, data_path, role)
        elif study_type == "PCA":
            data_path = validate_pca(doc_ref_dict, username, data_path)
        else:
            raise ValueError(f"Unknown study type: {study_type}")

        update_firestore("update_firestore::status=validated data")

        # Record a checksum of the data directory so later tampering is detectable.
        if constants.BLOCKS_MODE not in doc_ref_dict["description"]:
            data_hash = checksumdir.dirhash(data_path, "md5")
            update_firestore(f"update_firestore::DATA_HASH={data_hash}")

        # Persist the resolved path(s) for later protocol steps.
        with open(os.path.join(constants.SFKIT_DIR, "data_path.txt"), "w") as f:
            if study_type == "SF-GWAS":
                f.write(geno_binary_file_prefix + "\n")
            f.write(data_path + "\n")

        print("Successfully registered and validated data!")

    encrypt_mpcgwas(role, study_type)

    return True
def encrypt_mpcgwas(role: str, study_type: str) -> None:
    """Encrypt the local data for MPC-GWAS participants with role "1" or "2"; no-op otherwise."""
    if study_type != "MPC-GWAS" or role not in ("1", "2"):
        return
    print("Now encrypting data...")
    update_firestore("update_firestore::task=Encrypting data")
    try:
        encrypt_data()
    except Exception as e:
        # Surface the failure through the shared condition helper instead of crashing.
        condition_or_fail(False, f"encrypt_data::error={e}")
def validate_sfgwas(
    doc_ref_dict: dict, username: str, data_path: str, geno_binary_file_prefix: str
) -> Tuple[str, str]:
    """
    Validate SF-GWAS inputs against the study parameters.

    Returns the (possibly prompted-for) geno binary file prefix and data path.
    Exits early via using_demo() when demo data is in play.
    """
    geno_binary_file_prefix = validate_geno_binary_file_prefix(geno_binary_file_prefix)
    data_path = validate_data_path(data_path)

    if data_path == "demo" or (constants.IS_DOCKER and doc_ref_dict["demo"]):
        using_demo()

    num_inds: int = validate_sfgwas_data(geno_binary_file_prefix, data_path)
    expected_inds = doc_ref_dict["personal_parameters"][username]["NUM_INDS"]["value"]
    if expected_inds == "":
        condition_or_fail(False, "NUM_INDS is not set. Please set it and try again.")
    else:
        condition_or_fail(
            num_inds == int(expected_inds),
            "NUM_INDS does not match the number of individuals in the data.",
        )

    num_snps: int = num_rows(os.path.join(data_path, "snp_ids.txt"))
    condition_or_fail(
        num_snps == int(doc_ref_dict["parameters"]["num_snps"]["value"]),
        "num_snps does not match the number of SNPs in the data.",
    )
    print(f"Your data has {num_inds} individuals and {num_snps} SNPs.")

    return geno_binary_file_prefix, data_path
def validate_mpcgwas(doc_ref_dict: dict, username: str, data_path: str, role: str) -> str:
    """
    Validate MPC-GWAS inputs against the study parameters.

    Returns the (possibly prompted-for) data path. Role "1" additionally
    uploads pos.txt to the website.
    """
    data_path = validate_data_path(data_path)

    if data_path == "demo" or (constants.IS_DOCKER and doc_ref_dict["demo"]):
        using_demo()

    num_inds, num_covs = validate_mpcgwas_data(data_path)

    expected_inds = int(doc_ref_dict["personal_parameters"][username]["NUM_INDS"]["value"])
    condition_or_fail(
        num_inds == expected_inds,
        "NUM_INDS does not match the number of individuals in the data.",
    )
    expected_covs = int(doc_ref_dict["parameters"]["NUM_COVS"]["value"])
    condition_or_fail(
        num_covs == expected_covs,
        "NUM_COVS does not match the number of covariates in the data.",
    )

    print(f"Your data has {num_inds} individuals and {num_covs} covariates.")

    if role == "1":
        website_send_file(open(os.path.join(data_path, "pos.txt"), "r"), "pos.txt")

    return data_path
def validate_pca(doc_ref_dict: dict, username: str, data_path: str) -> str:
    """
    Validate PCA input dimensions against the study parameters.

    Returns the (possibly prompted-for) data path.
    """
    data_path = validate_data_path(data_path)

    if data_path == "demo" or (constants.IS_DOCKER and doc_ref_dict["demo"]):
        using_demo()

    data_file = os.path.join(data_path, "data.txt")

    n_rows: int = num_rows(data_file)
    condition_or_fail(
        n_rows == int(doc_ref_dict["personal_parameters"][username]["NUM_INDS"]["value"]),
        "NUM_INDS does not match the number of rows in the data.",
    )
    n_cols: int = num_cols(data_file)
    condition_or_fail(
        n_cols == int(doc_ref_dict["parameters"]["num_columns"]["value"]),
        "num_columns does not match the number of columns in the data.",
    )
    print(f"Your data has {n_rows} rows and {n_cols} columns.")

    return data_path
def validate_geno_binary_file_prefix(geno_binary_file_prefix: str) -> str:
    """
    Resolve and sanity-check the geno binary file prefix.

    When empty, falls back to the docker default path or prompts the user.
    Exits the process when a non-"demo" prefix is not an absolute path.

    :param geno_binary_file_prefix: caller-supplied prefix; may be empty.
    :return: the resolved prefix (may be "demo").
    """
    if not geno_binary_file_prefix:
        if constants.IS_DOCKER and os.path.exists("/app/data/geno"):
            # Fix: plain literal — the former f-string had no placeholders.
            geno_binary_file_prefix = "/app/data/geno/ch%d"
            print(f"Using default geno_binary_file_prefix for docker: {geno_binary_file_prefix}")
        else:
            # Fix: plain literal here too (the sourcery-skip comment flagged it).
            geno_binary_file_prefix = input(
                "Enter absolute path to geno binary file prefix (e.g. '/home/username/for_sfgwas/geno/ch%d'): "
            )
    if geno_binary_file_prefix != "demo" and not os.path.isabs(geno_binary_file_prefix):
        print("I need an ABSOLUTE path for the geno_binary_file_prefix.")
        exit(1)
    return geno_binary_file_prefix
def validate_data_path(data_path: str) -> str:
    """
    Resolve and sanity-check the data directory path.

    When empty, falls back to the docker default or prompts the user.
    Exits the process when a non-"demo" path is not absolute.
    """
    if not data_path:
        if constants.IS_DOCKER and os.path.exists("/app/data"):
            data_path = "/app/data"
            print(f"Using default data_path for docker: {data_path}")
        else:
            data_path = input("Enter the (absolute) path to your data files (e.g. /home/username/for_sfgwas): ")

    is_relative = not os.path.isabs(data_path)
    if is_relative and data_path != "demo":
        print("I need an ABSOLUTE path for the data_path.")
        exit(1)
    return data_path
def validate_sfgwas_data(geno_binary_file_prefix: str, data_path: str) -> int:
    """
    Check that the SF-GWAS input files exist and agree in size.

    Returns the number of rows in pheno.txt (the individual count).
    """
    base = geno_binary_file_prefix % 1
    for ext in ("pgen", "pvar", "psam"):
        condition_or_fail(
            os.path.isfile(f"{base}.{ext}"),
            f"Could not find {base}.{ext} file.",
        )

    rows: int = num_rows(os.path.join(data_path, "pheno.txt"))
    for other_file, message in (
        ("cov.txt", "pheno and cov have different number of rows"),
        ("sample_keep.txt", "pheno and sample_keep differ in num-rows"),
    ):
        condition_or_fail(rows == num_rows(os.path.join(data_path, other_file)), message)

    # Fail if snp_ids.txt contains a duplicate line.
    duplicate_line = find_duplicate_line(os.path.join(data_path, "snp_ids.txt"))
    condition_or_fail(duplicate_line is None, f"snp_ids.txt has duplicate line: {duplicate_line}")

    return rows
def validate_mpcgwas_data(data_path: str) -> Tuple[int, int]:
    """
    Cross-check the MPC-GWAS input files.

    Returns (number of rows in cov.txt, number of covariate columns).
    """
    cov_file = os.path.join(data_path, "cov.txt")
    rows = num_rows(cov_file)
    condition_or_fail(
        rows == num_rows(os.path.join(data_path, "geno.txt")), "cov and geno have different number of rows"
    )
    condition_or_fail(
        rows == num_rows(os.path.join(data_path, "pheno.txt")), "cov and pheno have different number of rows"
    )
    num_covs = num_cols(cov_file)

    # Fail if pos.txt contains a duplicate line.
    duplicate_line = find_duplicate_line(os.path.join(data_path, "pos.txt"))
    condition_or_fail(duplicate_line is None, f"pos.txt has duplicate line: {duplicate_line}")

    return rows, num_covs
def num_rows(file_path: str) -> int:
    """Return the number of lines in the file at *file_path*.

    Uses a context manager so the handle is closed deterministically
    (the previous version relied on the garbage collector to close it).
    """
    with open(file_path) as f:
        return sum(1 for _ in f)
def num_cols(file_path: str) -> int:
    """Return the number of whitespace-separated fields on the first line of *file_path*.

    Uses a context manager so the handle is closed deterministically
    (the previous version relied on the garbage collector to close it).
    """
    with open(file_path) as f:
        return len(f.readline().split())
def using_demo() -> None:
    """Mark demo data as validated on the server, announce it, and stop the process.

    Raises SystemExit(0) via exit(); callers never continue past this point.
    """
    update_firestore("update_firestore::status=validated data")
    print("Using demo data!")
    print("Successfully registered and validated data!")
    exit(0)
def find_duplicate_line(filename: str) -> Optional[str]:
    """
    Return the first (stripped) line of *filename* that has appeared earlier
    in the file, or None if every line is unique.

    The previous implementation only compared each line with its immediate
    predecessor, so it missed duplicates that were not adjacent (i.e. it was
    only correct for sorted files). Tracking all lines seen so far catches a
    duplicate regardless of its position, at O(n) memory in the line count.
    """
    seen: set = set()
    with open(filename, "r") as file:
        for line in file:
            stripped = line.strip()
            if stripped in seen:
                return stripped
            seen.add(stripped)
    return None