Coverage for kye/loader/json_lines.py: 22%

58 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-11-13 15:17 -0700

1import json 

2from pathlib import Path 

3from kye.dataset import Type, Edge, Models, TYPE_REF 

4from typing import Any 

5from duckdb import DuckDBPyConnection, DuckDBPyRelation 

6import re 

7 

# Directory that receives the generated JSON-lines files: the `data` folder
# under the third parent of this file's path. It is created on import if it
# does not exist yet.
DIR = Path(__file__).parent.parent.parent.joinpath('data')
DIR.mkdir(exist_ok=True, parents=True)
assert DIR.is_dir()

11 

def normalize_value(typ: Type, data: Any):
    """Normalize a single raw JSON value according to ``typ``.

    * ``None`` stays ``None``.
    * When ``typ.has_edges`` is true, ``data`` must be a ``dict``. Every edge
      of ``typ`` whose name is present in ``data`` is normalized with
      ``normalize_edge``; edges that normalize to ``None`` are dropped. If
      ``typ.has_index`` is true, every key in ``typ.index`` must survive.
      An object with no surviving edges is returned as ``None``.
    * Otherwise ``data`` must not be a ``dict`` and is converted to ``str``;
      floats lose a trailing ``.0`` (``1.0`` -> ``'1'``).

    Raises:
        AssertionError: if ``data`` has the wrong shape for ``typ`` or an
            index edge is missing. The checks are raised explicitly rather
            than written as ``assert`` statements so that they still run
            under ``python -O``.
    """
    if data is None:
        return None

    # TODO: reshape id maps { [id]: { ... } } to [ { id, ... } ]
    # not sure if we want to do that auto-magically or have it explicitly
    # defined as part of the schema
    if typ.has_edges:
        # TODO: better error handling, i.e trace location in data
        # so that we can report the location of the error
        if type(data) is not dict:
            raise AssertionError(
                f'Expected a dict for {repr(typ)}, got {type(data).__name__}'
            )

        edges = {}
        for edge in typ:
            if edge.name not in data:
                continue

            val = normalize_edge(edge, data.get(edge.name))
            if val is not None:
                edges[edge.name] = val

        if typ.has_index:
            missing_indexes = [key for key in typ.index if key not in edges]
            if missing_indexes:
                raise AssertionError(
                    f'Missing indexes for {repr(typ)}: {",".join(missing_indexes)}'
                )

        if len(edges) == 0:
            return None

        return edges

    if type(data) is dict:
        raise AssertionError(
            f'Expected a scalar value for {repr(typ)}, got a dict'
        )

    if type(data) is float:
        # Whole-number floats are written without the trailing ".0".
        return re.sub(r'\.0$', '', str(data))

    return str(data)

48 

def normalize_values(typ: Type, data: Any):
    """Normalize ``data`` as a collection of values of ``typ``.

    A value that is not a ``list`` is treated as a one-element list. Each
    item goes through ``normalize_value`` and items that normalize to
    ``None`` are discarded. Returns the list of normalized items, or
    ``None`` when ``data`` is ``None`` or nothing is left.
    """
    if data is None:
        return None

    items = data if type(data) is list else [data]
    normalized = (normalize_value(typ, item) for item in items)
    kept = [val for val in normalized if val is not None]

    # An empty result is reported as None rather than [].
    return kept if kept else None

66 

def normalize_edge(edge: Edge, data: Any):
    """Normalize the raw value stored under ``edge``.

    ``None`` stays ``None``. When ``edge.multiple`` is true the value is
    normalized with ``normalize_values`` (a scalar is accepted and treated
    as a single-item list there); otherwise it must not be a ``list`` and
    is normalized with ``normalize_value``. Both use ``edge.type``.

    Raises:
        AssertionError: if a ``list`` is supplied for a single-valued edge.
            Raised explicitly (not via ``assert``) so the check also runs
            under ``python -O``.
    """
    if data is None:
        return None

    if edge.multiple:
        return normalize_values(edge.type, data)

    if type(data) is list:
        raise AssertionError(
            f'Expected a single value for edge {edge!r}, got a list'
        )
    return normalize_value(edge.type, data)

76 

def from_json(typ: Type, data: list[dict], con: DuckDBPyConnection) -> DuckDBPyRelation:
    """Write ``data`` as normalized JSON lines and load the file with DuckDB.

    The rows are normalized with ``normalize_values``, written one JSON
    document per line to ``DIR / '<typ.ref>.jsonl'`` (overwriting any
    existing file), and the file is then read back through
    ``con.read_json``.

    Args:
        typ: type describing the rows; ``typ.ref`` names the output file.
        data: raw rows to normalize.
        con: DuckDB connection used to read the written file.

    Returns:
        The relation produced by ``con.read_json`` for the written file.
    """
    file_path = DIR / f'{typ.ref}.jsonl'

    # normalize_values returns None (not an empty list) when the input is
    # empty or every row normalizes away; iterating None would raise
    # TypeError. Normalizing before opening the file also means a
    # validation error does not leave a truncated file behind.
    # NOTE(review): how DuckDB's read_json treats an empty file is not
    # checked here - confirm against the DuckDB version in use.
    rows = normalize_values(typ, data) or []

    with file_path.open('w', encoding='utf-8') as f:
        for row in rows:
            json.dump(row, f)
            f.write('\n')

    return con.read_json(str(file_path))