Coverage for kye/loader/json_lines.py: 22%
58 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-13 15:17 -0700
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-13 15:17 -0700
1import json
2from pathlib import Path
3from kye.dataset import Type, Edge, Models, TYPE_REF
4from typing import Any
5from duckdb import DuckDBPyConnection, DuckDBPyRelation
6import re
# Output directory for the generated .jsonl files: a "data" folder located
# three directory levels above this module. It is created at import time
# (including any missing parents) so later writes can assume it exists.
DIR = Path(__file__).parent.parent.parent.joinpath('data')
DIR.mkdir(exist_ok=True, parents=True)
assert DIR.is_dir()
def normalize_value(typ: Type, data: Any):
    """Normalize a single raw JSON value against ``typ``.

    For a type that has edges, ``data`` must be a dict. Every edge of ``typ``
    whose name is present in ``data`` is normalized with ``normalize_edge``;
    edges that normalize to ``None`` are dropped. If ``typ`` has an index,
    every index key must survive normalization.

    For a type without edges, ``data`` is converted to its string form; a
    float that ends in ``.0`` loses that suffix (``1.0`` -> ``'1'``).

    Args:
        typ: The schema type describing the expected shape of ``data``.
        data: The raw value to normalize.

    Returns:
        ``None`` if ``data`` is ``None`` or no edge produced a value, a dict
        mapping edge names to normalized values for a type with edges, or a
        string for a type without edges.

    Raises:
        AssertionError: If ``data`` is not a dict for a type with edges, if an
            index edge is missing, or if ``data`` is a dict for a type without
            edges. Raised explicitly (rather than via ``assert``) so that the
            validation still runs under ``python -O``; the exception type is
            kept for backward compatibility.
    """
    if data is None:
        return None

    # TODO: reshape id maps { [id]: { ... } } to [ { id, ... } ]
    # not sure if we want to do that auto-magically or have it explicitly
    # defined as part of the schema
    if typ.has_edges:
        # TODO: better error handling, i.e trace location in data
        # so that we can report the location of the error
        if not isinstance(data, dict):
            raise AssertionError(
                f'Expected an object for {typ!r}, got {type(data).__name__}'
            )

        edges = {}
        for edge in typ:
            if edge.name not in data:
                continue

            val = normalize_edge(edge, data[edge.name])
            if val is not None:
                edges[edge.name] = val

        if typ.has_index:
            missing_indexes = [key for key in typ.index if key not in edges]
            if missing_indexes:
                raise AssertionError(
                    f'Missing indexes for {repr(typ)}: {",".join(missing_indexes)}'
                )

        if not edges:
            return None

        return edges

    # A type without edges is a scalar; an object here means the data does
    # not match the schema.
    if isinstance(data, dict):
        raise AssertionError(
            f'Expected a scalar value for {typ!r}, got an object'
        )

    if isinstance(data, float):
        # Whole-number floats are rendered without the trailing ".0".
        return re.sub(r'\.0$', '', str(data))

    return str(data)
def normalize_values(typ: Type, data: Any):
    """Normalize ``data`` as a collection of values of type ``typ``.

    A value that is not a list is treated as a one-element list. Each item is
    passed through ``normalize_value`` and items that come back as ``None``
    are discarded.

    Args:
        typ: The schema type every item is normalized against.
        data: A list of raw values, a single raw value, or ``None``.

    Returns:
        The list of normalized items, or ``None`` when ``data`` is ``None``
        or nothing remains after normalization.
    """
    if data is None:
        return None

    # Only an exact ``list`` counts as "many"; anything else is one item.
    items = data if type(data) is list else [data]

    normalized = [
        val
        for val in (normalize_value(typ, item) for item in items)
        if val is not None
    ]

    return normalized if normalized else None
def normalize_edge(edge: Edge, data: Any):
    """Normalize the raw value found for ``edge``.

    An edge that allows multiple values is normalized with
    ``normalize_values`` against ``edge.type``; any other edge is normalized
    with ``normalize_value`` and must not be given a list.

    Args:
        edge: The schema edge being populated.
        data: The raw value stored under the edge's name.

    Returns:
        ``None`` if ``data`` is ``None``; otherwise whatever
        ``normalize_values`` / ``normalize_value`` returns for ``edge.type``.

    Raises:
        AssertionError: If the edge does not allow multiple values but
            ``data`` is a list. Raised explicitly (rather than via ``assert``)
            so the check is not stripped under ``python -O``; the exception
            type is kept for backward compatibility.
    """
    if data is None:
        return None

    if edge.multiple:
        return normalize_values(edge.type, data)

    if isinstance(data, list):
        raise AssertionError(
            f"Edge '{edge.name}' does not allow multiple values, got a list"
        )

    return normalize_value(edge.type, data)
def from_json(typ: Type, data: list[dict], con: DuckDBPyConnection) -> DuckDBPyRelation:
    """Write ``data`` as a JSON-lines file and load it through DuckDB.

    The rows are normalized against ``typ`` with ``normalize_values``, written
    one JSON document per line to ``DIR / '<typ.ref>.jsonl'`` (overwriting any
    existing file), and the file is then read with ``con.read_json``.

    Args:
        typ: The schema type the rows are normalized against; ``typ.ref``
            names the output file.
        data: The raw rows to load.
        con: The DuckDB connection used to read the written file.

    Returns:
        The relation returned by ``con.read_json`` for the written file.

    Raises:
        AssertionError: Propagated from normalization when a row does not
            match ``typ``.
    """
    # Normalize before opening the file so a validation failure does not
    # truncate a previously written file. ``normalize_values`` returns None
    # when there is nothing to write; treat that as "no rows" instead of
    # failing with "'NoneType' object is not iterable".
    rows = normalize_values(typ, data) or []

    file_path = DIR / f'{typ.ref}.jsonl'

    with file_path.open('w', encoding='utf-8') as f:
        f.writelines(json.dumps(row) + '\n' for row in rows)

    # NOTE(review): how DuckDB's read_json treats a file with zero rows is
    # not verified here -- confirm against the DuckDB version in use.
    return con.read_json(str(file_path))