journal_formatting/ris_parser.py at main · g-pachakis/journal_formatting · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
"""
RIS file parser.

Parses .ris bibliography files into structured reference dicts
for citation matching and formatting.
"""

import re


def parse_ris(path):
    """Parse a .ris file and return a list of reference dicts.

    Each dict contains:
        type: str (JOUR, BOOK, CHAP, CONF, etc.)
        authors: list[str] (e.g., ['Shannon, Claude E.', 'Weaver, Warren'])
        title: str
        journal: str (for JOUR) or book_title for CHAP
        year: str
        volume: str
        issue: str
        start_page: str
        end_page: str
        doi: str
        publisher: str
        place: str
        edition: str
        editors: list[str]
        url: str
        keywords: list[str]
    """
    with open(path, 'r', encoding='utf-8-sig') as f:
        content = f.read()

    records = []
    current = None

    for line in content.splitlines():
        line = line.rstrip()
        if not line:
            continue

        # Match RIS tag: two uppercase letters, two spaces, hyphen, space, value
        m = re.match(r'^([A-Z][A-Z0-9])\s{2}-\s?(.*)', line)
        if not m:
            # Continuation line — append to last field if possible
            if current and current.get('_last_tag'):
                tag = current['_last_tag']
                if tag in current:
                    if isinstance(current[tag], list):
                        current[tag][-1] += ' ' + line.strip()
                    else:
                        current[tag] += ' ' + line.strip()
            continue

        tag = m.group(1)
        value = m.group(2).strip()

        if tag == 'TY':
            current = {'TY': value, '_last_tag': 'TY'}
            continue

        if tag == 'ER':
            if current:
                del current['_last_tag']
                records.append(_normalize_record(current))
                current = None
            continue

        if current is None:
            continue

        current['_last_tag'] = tag

        # Repeatable fields stored as lists
        if tag in ('AU', 'A1', 'A2', 'A3', 'A4', 'ED', 'KW'):
            current.setdefault(tag, [])
            current[tag].append(value)
        else:
            current[tag] = value

    # Handle file without final ER
    if current:
        if '_last_tag' in current:
            del current['_last_tag']
        records.append(_normalize_record(current))

    return records


def _normalize_record(raw):
    """Convert raw RIS tag dict to a clean reference dict."""
    ref = {
        'type': raw.get('TY', 'GEN'),
        'authors': raw.get('AU', raw.get('A1', [])),
        'editors': raw.get('A2', raw.get('ED', [])),
        'title': raw.get('TI', raw.get('T1', '')),
        'journal': raw.get('T2', raw.get('JO', raw.get('JF', raw.get('JA', '')))),
        'year': '',
        'volume': raw.get('VL', ''),
        'issue': raw.get('IS', ''),
        'start_page': raw.get('SP', ''),
        'end_page': raw.get('EP', ''),
        'doi': raw.get('DO', ''),
        'publisher': raw.get('PB', ''),
        'place': raw.get('CY', raw.get('PP', '')),
        'edition': raw.get('ET', ''),
        'url': raw.get('UR', ''),
        'keywords': raw.get('KW', []),
        'isbn': raw.get('SN', ''),
        'abstract': raw.get('AB', raw.get('N2', '')),
    }

    # Extract year from PY or Y1 (format: YYYY/MM/DD/other or just YYYY)
    py = raw.get('PY', raw.get('Y1', raw.get('DA', '')))
    if py:
        year_match = re.match(r'(\d{4})', py)
        if year_match:
            ref['year'] = year_match.group(1)

    # For book types, title might be in BT
    if not ref['title'] and 'BT' in raw:
        ref['title'] = raw['BT']

    return ref


def match_citation_to_ris(citation_text, ris_records):
    """Match a hardwritten citation string to a RIS record.

    Uses fuzzy matching on author last names and year.
    Returns the matched RIS record or None.
    """
    citation_text = citation_text.strip()
    # Strip leading [N] number
    citation_text = re.sub(r'^\[?\d+\]?\s*', '', citation_text)

    for rec in ris_records:
        score = 0
        total = 0

        # Match year
        if rec['year']:
            total += 2
            if rec['year'] in citation_text:
                score += 2

        # Match author last names
        for author in rec['authors'][:3]:  # Check first 3 authors
            last_name = author.split(',')[0].strip()
            if last_name and len(last_name) > 2:
                total += 1
                if last_name in citation_text:
                    score += 1

        # Match DOI
        if rec['doi'] and rec['doi'] in citation_text:
            return rec

        # Match title words (at least 3 consecutive words)
        if rec['title']:
            title_words = rec['title'].split()
            if len(title_words) >= 3:
                # Check if 3+ consecutive title words appear in citation
                for i in range(len(title_words) - 2):
                    snippet = ' '.join(title_words[i:i+3]).lower()
                    if snippet.lower() in citation_text.lower():
                        score += 2
                        total += 2
                        break
                else:
                    total += 2

        # Need at least 60% match with minimum 2 points
        if total > 0 and score >= 2 and score / total >= 0.5:
            return rec

    return None