msa-experiment/02_prepare_documents.py at master · pulseandthread/msa-experiment · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
Step 2: Convert clean conversation history into MSA documents.
Each document = one conversation turn (user message + assistant response).
Autonomous/unprompted assistant messages become standalone documents.
"""
import json
from collections import Counter
from pathlib import Path

# Every path is anchored at the directory that contains this script.
PROJECT_DIR = Path(__file__).parent
# Input read by build_documents(): JSON with a top-level 'conversation' list.
DATASET_FILE = PROJECT_DIR.joinpath("dataset", "conversation_complete.json")
# Output written by build_documents(): metadata plus the generated documents.
OUTPUT_FILE = PROJECT_DIR.joinpath("dataset", "documents.json")


def _make_document(doc_id, text, timestamp, doc_type, char_count):
    """Return one document record, with keys in the order used in the output file."""
    return {
        'doc_id': doc_id,
        'text': text,
        'timestamp': timestamp,
        'type': doc_type,
        'char_count': char_count,
    }


def _pair_messages(messages):
    """Group a flat message list into documents.

    A user message immediately followed by an assistant message becomes one
    'conversation' document. A user message with no assistant reply becomes
    an 'orphan_user' document. An assistant message that is not consumed as a
    reply becomes an 'autonomous' document when its 'autonomous' flag is
    truthy, otherwise an 'unprompted' document.

    Args:
        messages: list of dicts, each with 'role' and 'content' keys and
            optionally 'timestamp' and 'autonomous'.

    Returns:
        A ``(documents, skipped)`` tuple: the list of document dicts and the
        number of messages whose role was neither 'user' nor 'assistant'.
    """
    documents = []
    skipped = 0
    i = 0

    while i < len(messages):
        msg = messages[i]
        role = msg['role']
        # doc_ids are consecutive, so the next id is the current list length.
        doc_id = len(documents)

        if role == 'user':
            user_content = msg['content']
            user_ts = msg.get('timestamp', '')
            reply = messages[i + 1] if i + 1 < len(messages) else None

            if reply is not None and reply['role'] == 'assistant':
                # Standard turn: user + assistant pair.
                assistant_content = reply['content']
                assistant_ts = reply.get('timestamp', '')
                documents.append(_make_document(
                    doc_id,
                    f"User: {user_content}\nAssistant: {assistant_content}",
                    # Prefer the user's timestamp; fall back to the reply's.
                    user_ts or assistant_ts,
                    'conversation',
                    len(user_content) + len(assistant_content),
                ))
                i += 2  # the reply has been consumed together with the user message
            else:
                # Orphan user message (no response).
                documents.append(_make_document(
                    doc_id,
                    f"User: {user_content}",
                    user_ts,
                    'orphan_user',
                    len(user_content),
                ))
                i += 1

        elif role == 'assistant':
            # Standalone assistant message (autonomous/unprompted).
            assistant_content = msg['content']
            documents.append(_make_document(
                doc_id,
                f"Assistant (unprompted): {assistant_content}",
                msg.get('timestamp', ''),
                'autonomous' if msg.get('autonomous', False) else 'unprompted',
                len(assistant_content),
            ))
            i += 1

        else:
            # Any other role produces no document; count it so the drop is visible.
            skipped += 1
            i += 1

    return documents, skipped


def _summarize(documents):
    """Compute the metadata section of the output file from the documents."""
    total_chars = sum(d['char_count'] for d in documents)
    avg_chars = total_chars / len(documents) if documents else 0
    # dict(Counter(...)) keeps first-seen order and a plain-dict repr, so the
    # JSON and the printed summary look the same as with a hand-built dict.
    types = dict(Counter(d['type'] for d in documents))

    # Only the first 10 characters of each non-empty timestamp are compared.
    dates = [d['timestamp'][:10] for d in documents if d.get('timestamp')]
    earliest = min(dates) if dates else 'unknown'
    latest = max(dates) if dates else 'unknown'

    return {
        'total_documents': len(documents),
        'total_characters': total_chars,
        # Token figures use a flat 4-characters-per-token estimate.
        'estimated_tokens': total_chars // 4,
        'avg_chars_per_doc': int(avg_chars),
        'avg_tokens_per_doc': int(avg_chars // 4),
        'date_range': f"{earliest} to {latest}",
        'document_types': types,
    }


def build_documents(dataset_file=None, output_file=None):
    """Convert the conversation dataset into a documents JSON file.

    Loads the JSON dataset, reads its 'conversation' list, groups the
    messages into documents, writes ``{'metadata': ..., 'documents': ...}``
    to the output file and prints a summary to stdout.

    Args:
        dataset_file: path of the input JSON; defaults to ``DATASET_FILE``.
        output_file: path of the JSON to write; defaults to ``OUTPUT_FILE``.
            Its parent directory is created if it does not exist.

    Raises:
        OSError: if the dataset cannot be read or the output cannot be written.
        KeyError: if the dataset has no 'conversation' key, or a message
            lacks 'role' or (for user/assistant messages) 'content'.
    """
    dataset_file = Path(DATASET_FILE if dataset_file is None else dataset_file)
    output_file = Path(OUTPUT_FILE if output_file is None else output_file)

    print("Loading conversation dataset...")
    with open(dataset_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    messages = data['conversation']
    print(f"Total messages: {len(messages)}")

    documents, skipped = _pair_messages(messages)
    if skipped:
        print(f"Skipped {skipped} message(s) with an unrecognized role")

    metadata = _summarize(documents)
    output = {
        'metadata': metadata,
        'documents': documents,
    }

    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"\n{'='*60}")
    print("DOCUMENTS PREPARED")
    print(f"{'='*60}")
    print(f"Output: {output_file}")
    print(f"Total documents: {len(documents):,}")
    print(f"Document types: {metadata['document_types']}")
    print(f"Estimated tokens: {metadata['estimated_tokens']:,}")
    print(f"Avg tokens/doc: {metadata['avg_tokens_per_doc']}")
    print(f"Date range: {metadata['date_range']}")


# Script entry point: build the documents file only when run directly.
if __name__ == "__main__":
    build_documents()