Coverage for sfkit/protocol/register_data.py: 100%

133 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-08-07 15:11 -0400

1import os 

2from typing import Optional, Tuple 

3 

4import checksumdir 

5 

6from sfkit.api import get_doc_ref_dict, update_firestore 

7from sfkit.encryption.mpc.encrypt_data import encrypt_data 

8from sfkit.utils import constants 

9from sfkit.utils.helper_functions import authenticate_user, condition_or_fail 

10from sfkit.api import get_username, website_send_file 

11 

12 

def register_data(geno_binary_file_prefix: str, data_path: str) -> bool:
    """
    Register data with the server and validate that the data formatting looks correct.

    Runs the study-type-specific validator (unless this user is already marked
    "validated"), records a hash of the data directory in Firestore, persists
    the resolved paths for later pipeline steps, and finally triggers
    MPC-GWAS encryption where applicable.

    :param geno_binary_file_prefix: prefix for SF-GWAS genotype binaries
        (may be empty; the validator will resolve/prompt for it).
    :param data_path: path to the input data directory (may be empty;
        the validator will resolve/prompt for it).
    :return: True on success (validators terminate the process on failure).
    """
    authenticate_user()

    doc_ref_dict: dict = get_doc_ref_dict()
    username: str = get_username()
    # Role is this participant's index within the study's participant list ("0", "1", ...).
    role: str = str(doc_ref_dict["participants"].index(username))
    study_type: str = doc_ref_dict["study_type"]

    validated = "validated" in doc_ref_dict["status"][username]
    if not validated:
        if study_type == "SF-GWAS":
            # When the study description indicates blocks mode, local
            # validation of the SF-GWAS inputs is skipped entirely.
            if constants.BLOCKS_MODE not in doc_ref_dict["description"]:
                geno_binary_file_prefix, data_path = validate_sfgwas(
                    doc_ref_dict, username, data_path, geno_binary_file_prefix
                )
        elif study_type == "MPC-GWAS":
            data_path = validate_mpcgwas(doc_ref_dict, username, data_path, role)
        elif study_type == "PCA":
            data_path = validate_pca(doc_ref_dict, username, data_path)
        else:
            raise ValueError(f"Unknown study type: {study_type}")

        update_firestore("update_firestore::status=validated data")

        if constants.BLOCKS_MODE not in doc_ref_dict["description"]:
            # Checksum the whole data directory so the server can detect
            # if the registered data changes later.
            data_hash = checksumdir.dirhash(data_path, "md5")
            update_firestore(f"update_firestore::DATA_HASH={data_hash}")

            # Persist the resolved paths for subsequent protocol steps.
            with open(os.path.join(constants.SFKIT_DIR, "data_path.txt"), "w") as f:
                if study_type == "SF-GWAS":
                    f.write(geno_binary_file_prefix + "\n")
                f.write(data_path + "\n")

        print("Successfully registered and validated data!")
    else:
        print("Data has already been validated; skipping validation step.")

    # No-op unless study_type is MPC-GWAS and role is "1" or "2".
    encrypt_mpcgwas(role, study_type)

    return True

56 

57 

def encrypt_mpcgwas(role: str, study_type: str) -> None:
    """
    Encrypt the local data for MPC-GWAS studies.

    Applies only when the study is MPC-GWAS and this participant's role is
    "1" or "2"; any encryption failure is reported via condition_or_fail.
    """
    if study_type != "MPC-GWAS" or role not in {"1", "2"}:
        return

    print("Now encrypting data...")
    update_firestore("update_firestore::task=Encrypting data")
    try:
        encrypt_data()
    except Exception as e:
        condition_or_fail(False, f"encrypt_data::error={e}")

66 

67 

def validate_sfgwas(
    doc_ref_dict: dict, username: str, data_path: str, geno_binary_file_prefix: str
) -> Tuple[str, str]:
    """
    Validate SF-GWAS input data against the study's recorded parameters.

    Resolves the geno binary file prefix and data path (prompting the user or
    applying Docker defaults as needed), then checks that the number of
    individuals and SNPs in the data match NUM_INDS and num_snps from the
    study document.  Failures terminate the process via condition_or_fail;
    demo data short-circuits via using_demo(), which exits after reporting
    success.

    :return: the resolved (geno_binary_file_prefix, data_path) pair.
    """
    geno_binary_file_prefix = validate_geno_binary_file_prefix(geno_binary_file_prefix)
    data_path = validate_data_path(data_path)

    # Demo data needs no validation; using_demo() prints success and exits.
    if data_path == "demo" or (constants.IS_DOCKER and doc_ref_dict["demo"]):
        using_demo()

    num_inds: int = validate_sfgwas_data(geno_binary_file_prefix, data_path)
    num_inds_value = doc_ref_dict["personal_parameters"][username]["NUM_INDS"]["value"]
    if num_inds_value == "":
        # Distinguish "parameter never set" from "parameter mismatch".
        condition_or_fail(False, "NUM_INDS is not set. Please set it and try again.")
    else:
        condition_or_fail(
            num_inds == int(num_inds_value),
            "NUM_INDS does not match the number of individuals in the data.",
        )
    num_snps: int = num_rows(os.path.join(data_path, "snp_ids.txt"))
    condition_or_fail(
        num_snps == int(doc_ref_dict["parameters"]["num_snps"]["value"]),
        "num_snps does not match the number of SNPs in the data.",
    )
    print(f"Your data has {num_inds} individuals and {num_snps} SNPs.")

    return geno_binary_file_prefix, data_path

94 

95 

def validate_mpcgwas(doc_ref_dict: dict, username: str, data_path: str, role: str) -> str:
    """
    Validate MPC-GWAS input data against the study's recorded parameters.

    Resolves the data path, checks that the individual and covariate counts
    match NUM_INDS and NUM_COVS from the study document, and (for role "1")
    uploads pos.txt to the website.  Failures terminate the process via
    condition_or_fail; demo data short-circuits via using_demo(), which exits.

    :return: the resolved data_path.
    """
    data_path = validate_data_path(data_path)

    # Demo data needs no validation; using_demo() prints success and exits.
    if data_path == "demo" or (constants.IS_DOCKER and doc_ref_dict["demo"]):
        using_demo()

    num_inds, num_covs = validate_mpcgwas_data(data_path)
    condition_or_fail(
        num_inds == int(doc_ref_dict["personal_parameters"][username]["NUM_INDS"]["value"]),
        "NUM_INDS does not match the number of individuals in the data.",
    )
    condition_or_fail(
        num_covs == int(doc_ref_dict["parameters"]["NUM_COVS"]["value"]),
        "NUM_COVS does not match the number of covariates in the data.",
    )

    print(f"Your data has {num_inds} individuals and {num_covs} covariates.")

    if role == "1":
        # Use a context manager so the handle is closed after the upload;
        # the previous code passed a bare open() and leaked the descriptor.
        with open(os.path.join(data_path, "pos.txt"), "r") as pos_file:
            website_send_file(pos_file, "pos.txt")

    return data_path

118 

119 

def validate_pca(doc_ref_dict: dict, username: str, data_path: str) -> str:
    """
    Validate PCA input data against the study's recorded parameters.

    Resolves the data path, then checks that data.txt has NUM_INDS rows and
    num_columns columns per the study document.  Failures terminate the
    process via condition_or_fail; demo data short-circuits via using_demo().

    :return: the resolved data_path.
    """
    data_path = validate_data_path(data_path)

    # Demo data needs no validation; using_demo() prints success and exits.
    if data_path == "demo" or (constants.IS_DOCKER and doc_ref_dict["demo"]):
        using_demo()

    data_file = os.path.join(data_path, "data.txt")

    rows = num_rows(data_file)
    expected_rows = int(doc_ref_dict["personal_parameters"][username]["NUM_INDS"]["value"])
    condition_or_fail(rows == expected_rows, "NUM_INDS does not match the number of rows in the data.")

    cols = num_cols(data_file)
    expected_cols = int(doc_ref_dict["parameters"]["num_columns"]["value"])
    condition_or_fail(cols == expected_cols, "num_columns does not match the number of columns in the data.")

    print(f"Your data has {rows} rows and {cols} columns.")

    return data_path

139 

140 

def validate_geno_binary_file_prefix(geno_binary_file_prefix: str) -> str:
    """
    Resolve and validate the geno binary file prefix.

    Falls back to the Docker default when running in Docker with data
    mounted, otherwise prompts the user.  Any value other than "demo" must
    be an absolute path; a relative path terminates the process (exit 1).
    The "%d" in the prefix is a chromosome-number placeholder filled in
    later with the % operator.

    :return: the resolved prefix.
    """
    if not geno_binary_file_prefix:
        if constants.IS_DOCKER and os.path.exists("/app/data/geno"):
            # Plain literals here: the previous f-string prefixes were
            # redundant (no interpolation fields) and risked confusion
            # with the literal "%d" placeholder.
            geno_binary_file_prefix = "/app/data/geno/ch%d"
            print(f"Using default geno_binary_file_prefix for docker: {geno_binary_file_prefix}")
        else:
            geno_binary_file_prefix = input(
                "Enter absolute path to geno binary file prefix (e.g. '/home/username/for_sfgwas/geno/ch%d'): "
            )
    if geno_binary_file_prefix != "demo" and not os.path.isabs(geno_binary_file_prefix):
        print("I need an ABSOLUTE path for the geno_binary_file_prefix.")
        exit(1)
    return geno_binary_file_prefix

154 

155 

def validate_data_path(data_path: str) -> str:
    """
    Resolve and validate the data directory path.

    Falls back to the Docker default when running in Docker with /app/data
    present, otherwise prompts the user.  Any value other than "demo" must
    be an absolute path; a relative path terminates the process (exit 1).

    :return: the resolved data path.
    """
    if not data_path:
        if constants.IS_DOCKER and os.path.exists("/app/data"):
            data_path = "/app/data"
            print(f"Using default data_path for docker: {data_path}")
        else:
            data_path = input("Enter the (absolute) path to your data files (e.g. /home/username/for_sfgwas): ")

    path_is_acceptable = data_path == "demo" or os.path.isabs(data_path)
    if not path_is_acceptable:
        print("I need an ABSOLUTE path for the data_path.")
        exit(1)
    return data_path

167 

168 

def validate_sfgwas_data(geno_binary_file_prefix: str, data_path: str) -> int:
    """
    Check that the SF-GWAS input files exist and are mutually consistent.

    Verifies the chromosome-1 genotype binaries (.pgen/.pvar/.psam) exist,
    that pheno.txt, cov.txt and sample_keep.txt have matching row counts,
    and that snp_ids.txt contains no duplicate lines.  Failures terminate
    the process via condition_or_fail.

    :return: the number of rows in pheno.txt (i.e. number of individuals).
    """
    chrom1_prefix = geno_binary_file_prefix % 1
    for ext in ("pgen", "pvar", "psam"):
        condition_or_fail(
            os.path.isfile(chrom1_prefix + "." + ext),
            f"Could not find {chrom1_prefix}.{ext} file.",
        )

    pheno_rows: int = num_rows(os.path.join(data_path, "pheno.txt"))
    condition_or_fail(
        pheno_rows == num_rows(os.path.join(data_path, "cov.txt")), "pheno and cov have different number of rows"
    )
    condition_or_fail(
        pheno_rows == num_rows(os.path.join(data_path, "sample_keep.txt")), "pheno and sample_keep differ in num-rows"
    )

    repeated_snp = find_duplicate_line(os.path.join(data_path, "snp_ids.txt"))
    condition_or_fail(repeated_snp is None, f"snp_ids.txt has duplicate line: {repeated_snp}")

    return pheno_rows

188 

189 

def validate_mpcgwas_data(data_path: str) -> Tuple[int, int]:
    """
    Check that the MPC-GWAS input files are mutually consistent.

    Verifies cov.txt, geno.txt and pheno.txt have matching row counts and
    that pos.txt contains no duplicate lines.  Failures terminate the
    process via condition_or_fail.

    :return: (number of rows, number of covariate columns) from cov.txt.
    """
    cov_path = os.path.join(data_path, "cov.txt")
    row_count = num_rows(cov_path)
    for other_file, message in (
        ("geno.txt", "cov and geno have different number of rows"),
        ("pheno.txt", "cov and pheno have different number of rows"),
    ):
        condition_or_fail(row_count == num_rows(os.path.join(data_path, other_file)), message)

    covariate_count = num_cols(cov_path)

    repeated_pos = find_duplicate_line(os.path.join(data_path, "pos.txt"))
    condition_or_fail(repeated_pos is None, f"pos.txt has duplicate line: {repeated_pos}")

    return row_count, covariate_count

204 

205 

def num_rows(file_path: str) -> int:
    """Return the number of lines in the file at *file_path*."""
    # Use a context manager: the previous version never closed the file,
    # leaking a descriptor on every call.
    with open(file_path) as f:
        return sum(1 for _ in f)

208 

209 

def num_cols(file_path: str) -> int:
    """Return the number of whitespace-separated fields in the first line of *file_path*."""
    # Use a context manager: the previous version never closed the file,
    # leaking a descriptor on every call.
    with open(file_path) as f:
        return len(f.readline().split())

212 

213 

def using_demo() -> None:
    """
    Mark demo data as validated, report success, and terminate the process.

    Demo data ships pre-validated, so registration short-circuits here with
    exit code 0.
    """
    update_firestore("update_firestore::status=validated data")
    print("Using demo data!", "Successfully registered and validated data!", sep="\n")
    exit(0)

219 

220 

def find_duplicate_line(filename: str) -> Optional[str]:
    """
    Return the first duplicated line in *filename* (stripped), or None.

    The previous implementation only compared each line with its immediate
    predecessor, so duplicates in an unsorted file went undetected even
    though callers use this to reject any duplicate entries.  Tracking all
    previously seen (stripped) lines in a set catches repeats anywhere in
    the file in a single pass.
    """
    seen = set()
    with open(filename, "r") as file:
        for line in file:
            stripped = line.strip()
            if stripped in seen:
                return stripped
            seen.add(stripped)
    return None