def export_statistics_to_file(stats: Dict[str, Any], output_path: str):
    """Write the report produced by ``print_statistics`` to a text file.

    Args:
        stats: Statistics mapping, passed unchanged to ``print_statistics``.
        output_path: Destination path; opened for writing as UTF-8.
    """
    # Function-scope import keeps this block self-contained.
    from contextlib import redirect_stdout

    with open(output_path, 'w', encoding='utf-8') as f:
        # redirect_stdout restores sys.stdout even when print_statistics
        # raises; a manual swap would leave stdout bound to a closed file.
        with redirect_stdout(f):
            print_statistics(stats)


def _explore_structure(obj, path="", depth=0, max_depth=4, max_keys=10):
    """Return indented text lines describing the shape of a JSON value.

    Dicts list at most ``max_keys`` entries, lists are described through
    their first item only, and recursion stops once ``depth`` exceeds
    ``max_depth``.
    """
    # NOTE(review): the indent unit and the literal spacing below are
    # reproduced exactly as they appear in the reviewed text - confirm
    # against the upstream file if wider indentation was intended.
    indent = " " * depth

    if depth > max_depth:
        return [f"{indent}... (max depth reached)"]

    lines = []
    if isinstance(obj, dict):
        lines.append(f"{indent}{path} (dict) - {len(obj)} keys:")
        for index, (key, value) in enumerate(obj.items()):
            if index >= max_keys:
                break
            key_path = f"{path}.{key}" if path else key
            if isinstance(value, (dict, list)):
                lines.extend(
                    _explore_structure(value, key_path, depth + 1, max_depth, max_keys)
                )
                continue
            # Long strings are truncated so the report stays readable.
            if isinstance(value, str) and len(value) > 50:
                sample = value[:50] + "..."
            else:
                sample = str(value)
            lines.append(f"{indent} {key}: {type(value).__name__} = {sample}")
        if len(obj) > max_keys:
            lines.append(f"{indent} ... and {len(obj) - max_keys} more keys")
    elif isinstance(obj, list):
        lines.append(f"{indent}{path} (list) - {len(obj)} items:")
        if obj:
            lines.append(f"{indent} Sample item structure:")
            lines.extend(
                _explore_structure(obj[0], f"{path}[0]", depth + 1, max_depth, max_keys)
            )
            if len(obj) > 1:
                lines.append(f"{indent} ... and {len(obj) - 1} more items")
    else:
        lines.append(f"{indent}{path}: {type(obj).__name__} = {obj}")
    return lines


def analyze_json_structure(timeline_path: str, output_path: str):
    """Analyze a JSON file's structure and export a text report.

    The report contains a root-level summary, a recursive structure
    listing and, when the root is a dict holding a non-empty list under
    ``semanticSegments``, counts and sample structures for segments that
    carry a ``visit`` or ``timelinePath`` key.

    Args:
        timeline_path: Path of the JSON file to read (UTF-8).
        output_path: Path of the text report to write (UTF-8).

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If ``timeline_path`` is not valid JSON.
    """
    print(f"Analyzing JSON structure from: {timeline_path}")

    with open(timeline_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    rule = "=" * 80
    thin_rule = "-" * 40

    # Build the whole report in memory so a failure during analysis does
    # not leave a truncated file behind.
    out = [
        rule + "\n",
        "TIMELINE JSON STRUCTURE ANALYSIS\n",
        rule + "\n\n",
        f"File: {timeline_path}\n",
        f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n",
        "ROOT LEVEL STRUCTURE:\n",
        thin_rule + "\n",
    ]

    # Valid JSON may have a list or scalar at the root; only dicts have keys.
    if isinstance(data, dict):
        for key, value in data.items():
            if isinstance(value, list):
                out.append(f"{key}: list with {len(value)} items\n")
            elif isinstance(value, dict):
                out.append(f"{key}: dict with {len(value)} keys\n")
            else:
                out.append(f"{key}: {type(value).__name__} = {value}\n")
    elif isinstance(data, list):
        out.append(f"(root): list with {len(data)} items\n")
    else:
        out.append(f"(root): {type(data).__name__} = {data}\n")

    out.append("\n" + rule + "\n")
    out.append("DETAILED STRUCTURE:\n")
    out.append(rule + "\n\n")
    out.extend(line + "\n" for line in _explore_structure(data))

    semantic_segments = data.get('semanticSegments', []) if isinstance(data, dict) else []
    if not isinstance(semantic_segments, list):
        # Anything other than a list cannot be sliced or counted as segments.
        semantic_segments = []

    if semantic_segments:
        out.append("\n" + rule + "\n")
        out.append("SEMANTIC SEGMENTS ANALYSIS:\n")
        out.append(rule + "\n\n")
        out.append(f"Total semantic segments: {len(semantic_segments)}\n\n")

        # Only dict segments can carry keys; skipping the rest avoids a
        # TypeError on numbers/null and a substring match on strings.
        dict_segments = [seg for seg in semantic_segments if isinstance(seg, dict)]
        visit_count = sum(1 for seg in dict_segments if 'visit' in seg)
        path_count = sum(1 for seg in dict_segments if 'timelinePath' in seg)
        out.append(f"Segments with visits: {visit_count}\n")
        out.append(f"Segments with timeline paths: {path_count}\n\n")

        sample_visit = None
        sample_path = None
        # Samples are taken from the first 100 segments only.
        for segment in semantic_segments[:100]:
            if not isinstance(segment, dict):
                continue
            if sample_visit is None and 'visit' in segment:
                sample_visit = segment
            if sample_path is None and 'timelinePath' in segment:
                sample_path = segment
            if sample_visit is not None and sample_path is not None:
                break

        if sample_visit is not None:
            out.append("SAMPLE VISIT STRUCTURE:\n")
            out.append(thin_rule + "\n")
            out.extend(
                line + "\n"
                for line in _explore_structure(sample_visit, "sample_visit")
            )
            out.append("\n")

        if sample_path is not None:
            out.append("SAMPLE TIMELINE PATH STRUCTURE:\n")
            out.append(thin_rule + "\n")
            out.extend(
                line + "\n"
                for line in _explore_structure(sample_path, "sample_timelinePath")
            )

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("".join(out))
structure_output_path) + print(f"āœ… JSON structure exported to: {structure_output_path}") + # Analyze the data stats = analyze_timeline_data(timeline_path) - # Print the results + # Print the results to console print_statistics(stats) - print(f"\nāœ… Analysis complete! File analyzed: {timeline_path}") + # Export statistics to file + print(f"\nšŸ“„ Exporting statistics to file...") + export_statistics_to_file(stats, stats_output_path) + print(f"āœ… Statistics exported to: {stats_output_path}") + + print(f"\nšŸŽ‰ Analysis complete!") + print(f"šŸ“Š Statistics file: {stats_output_path}") + print(f"šŸ—ļø Structure file: {structure_output_path}") + print(f"šŸ“ Source file: {timeline_path}") except FileNotFoundError: print(f"āŒ Error: Could not find Timeline.json at {timeline_path}")