-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreference_engine.py
More file actions
139 lines (113 loc) · 4.62 KB
/
reference_engine.py
File metadata and controls
139 lines (113 loc) · 4.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""
Reference resolution engine.
Orchestrates the full pipeline: extract DOIs, match against RIS,
optionally look up via CrossRef, and format in the target style.
"""
import re
from dataclasses import dataclass, field
from crossref_client import extract_dois, lookup_doi, search_reference
from ris_parser import match_citation_to_ris
from citation_formatter import format_reference_mdpi_runs
@dataclass
class ResolvedReference:
    """One bibliography entry together with whatever the pipeline found for it.

    Attributes:
        index: Reference number used when rendering the entry.
        original_text: The reference text exactly as it was supplied.
        metadata: Matched record; stays an empty dict while unmatched.
        source: Where the metadata came from. One of 'ris',
            'crossref_doi', 'crossref_search' or 'unmatched'.
        doi: DOI associated with the reference, or '' if none is known.
        formatted_runs: Formatted output runs; empty until formatting
            has been applied.
    """

    index: int
    original_text: str
    metadata: dict = field(default_factory=dict)
    source: str = 'unmatched'
    doi: str = ''
    formatted_runs: list = field(default_factory=list)
def extract_ref_number(text):
    """Return the reference number that opens *text*, or None if there is none.

    The number must sit at the very start of the string. It may be wrapped
    in square brackets and/or followed by a period, e.g. '[1]', '1.' or '12'.
    """
    leading_number = re.compile(r'^\[?(\d+)\]?\.?\s*').match(text)
    if leading_number is None:
        return None
    return int(leading_number.group(1))
# Leading reference label such as '[1]', '1.' or '[1].' plus the whitespace
# after it. The optional period must be included: without it a reference
# numbered '1. Smith' is rewritten as '1.<TAB>. Smith' with a stray period.
_LEADING_REF_LABEL = re.compile(r'^\[?\d+\]?\.?\s*')


def _match_in_ris(text, doi, ris_data):
    """Return the RIS record matching *doi* or, failing that, *text* (falsy if none)."""
    if doi:
        wanted = doi.lower()  # hoisted: invariant across the scan below
        for rec in ris_data:
            rec_doi = rec.get('doi')
            if rec_doi and rec_doi.lower() == wanted:
                # DOI equality is the most reliable signal; first hit wins.
                return rec
    # No DOI, or no record carries it: fall back to fuzzy text matching.
    return match_citation_to_ris(text, ris_data)


def _lookup_in_crossref(text, doi):
    """Return (metadata, source) from CrossRef, or ({}, 'unmatched') if nothing is found."""
    if doi:
        by_doi = lookup_doi(doi)
        if by_doi:
            return by_doi, 'crossref_doi'
    hits = search_reference(text, rows=1)
    if hits:
        return hits[0], 'crossref_search'
    return {}, 'unmatched'


def _plain_runs(index, text):
    """Return a single unstyled run holding *text* renumbered as 'N.<TAB>text'."""
    clean_text = _LEADING_REF_LABEL.sub('', text)
    return [
        {'text': f'{index}.\t{clean_text}',
         'bold': False, 'italic': False,
         'superscript': False, 'subscript': False}
    ]


def resolve_references(ref_items, ris_data=None, use_crossref=False,
                       progress_callback=None):
    """Resolve all references through the matching pipeline.

    For every item the pipeline extracts a DOI from the text, tries to match
    the supplied RIS records (by DOI, then by fuzzy text match), optionally
    queries CrossRef (by DOI, then by text search) and finally formats the
    result. References that stay unmatched keep their original text and are
    only renumbered to the 'N.<TAB>text' form.

    Args:
        ref_items: list of reader items where type == 'reference'; each item
            must provide its raw text under the 'text' key.
        ris_data: optional list of RIS records from ris_parser.parse_ris()
        use_crossref: whether to use CrossRef API for unmatched references
        progress_callback: optional callable(current, total, message) for GUI

    Returns:
        list of ResolvedReference objects, one per input item, in input order
    """
    results = []
    total = len(ref_items)

    for i, item in enumerate(ref_items):
        text = item['text']
        # Use the number written in the text; otherwise the 1-based position.
        ref_num = extract_ref_number(text) or (i + 1)
        resolved = ResolvedReference(index=ref_num, original_text=text)

        # Step 1: Extract DOIs from reference text (only the first is kept)
        dois = extract_dois(text)
        if dois:
            resolved.doi = dois[0]

        # Step 2: Match against RIS data (if provided)
        if ris_data:
            record = _match_in_ris(text, resolved.doi, ris_data)
            if record:
                resolved.metadata = record
                resolved.source = 'ris'

        # Step 3: CrossRef lookup (if enabled and still unmatched)
        if use_crossref and not resolved.metadata:
            if progress_callback:
                progress_callback(i + 1, total, f'Looking up reference {ref_num}...')
            resolved.metadata, resolved.source = _lookup_in_crossref(
                text, resolved.doi)
            # Only has an effect when the record came from a text search: a
            # successful DOI lookup implies resolved.doi was already set.
            if not resolved.doi and resolved.metadata.get('doi'):
                resolved.doi = resolved.metadata['doi']

        # Step 4: Format the reference
        if resolved.metadata:
            resolved.formatted_runs = format_reference_mdpi_runs(
                resolved.metadata, resolved.index)
        else:
            # Unmatched: preserve original as-is but fix numbering to N.\t format
            resolved.formatted_runs = _plain_runs(resolved.index, text)

        results.append(resolved)
        if progress_callback:
            progress_callback(i + 1, total, f'Resolved {i + 1}/{total} references')

    return results
def get_resolution_stats(results):
    """Summarise how a batch of resolved references was matched.

    Returns a dict holding the total number of results, one counter per
    source ('ris', 'crossref_doi', 'crossref_search', 'unmatched') and,
    under 'unmatched_indices', the index of every unmatched reference in
    the order encountered.
    """
    summary = {'total': len(results)}
    summary.update(dict.fromkeys(
        ('ris', 'crossref_doi', 'crossref_search', 'unmatched'), 0))
    summary['unmatched_indices'] = []

    for ref in results:
        origin = ref.source
        # Direct indexing on purpose: an unexpected source value is an error.
        summary[origin] = summary[origin] + 1
        if origin == 'unmatched':
            summary['unmatched_indices'].append(ref.index)

    return summary