journal_formatting/reference_engine.py at main · g-pachakis/journal_formatting · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""
Reference resolution engine.

Orchestrates the full pipeline: extract DOIs, match against RIS,
optionally look up via CrossRef, and format in the target style.
"""

import re
from dataclasses import dataclass, field

from crossref_client import extract_dois, lookup_doi, search_reference
from ris_parser import match_citation_to_ris
from citation_formatter import format_reference_mdpi_runs


@dataclass
class ResolvedReference:
    """A reference with resolved metadata and formatted output.

    ``resolve_references`` creates one instance per input item with only
    ``index`` and ``original_text`` set; the remaining fields start at
    their defaults and are filled in as the pipeline finds a match.
    """
    # Reference number: parsed from the start of the text when present,
    # otherwise the item's 1-based position in the input list.
    index: int
    # Raw reference text as read from the item's 'text' key.
    original_text: str
    # Matched record (an RIS record or a CrossRef result); an empty dict
    # means nothing matched.
    metadata: dict = field(default_factory=dict)
    # Which pipeline step supplied ``metadata``.
    source: str = 'unmatched'  # 'ris', 'crossref_doi', 'crossref_search', 'unmatched'
    # First DOI extracted from the text; when the text has none and a
    # CrossRef text search matched, the DOI of that result. '' if unknown.
    doi: str = ''
    # Formatted output.  For unmatched references this is a single dict with
    # the keys 'text', 'bold', 'italic', 'superscript' and 'subscript';
    # otherwise whatever format_reference_mdpi_runs returns (presumably a
    # list of dicts of the same shape -- confirm against citation_formatter).
    formatted_runs: list = field(default_factory=list)


def extract_ref_number(text):
    """Return the leading reference number of *text* as an int.

    Recognises a run of digits at the very start of the string, optionally
    wrapped in square brackets and/or followed by a dot (e.g. '[1]', '1.',
    '[1].').  Returns None when the text does not start that way.
    """
    leading = re.match(r'^\[?(\d+)\]?\.?\s*', text)
    if leading is None:
        return None
    digits = leading.group(1)
    return int(digits)


def resolve_references(ref_items, ris_data=None, use_crossref=False,
                       progress_callback=None):
    """Resolve all references through the matching pipeline.

    Each item goes through these steps in order; a later lookup step is
    skipped once an earlier one has produced metadata:

    1. Extract DOIs from the reference text (the first one is kept).
    2. Match against RIS records: exact case-insensitive DOI match first,
       then ``match_citation_to_ris`` on the full text.
    3. If ``use_crossref`` is set: CrossRef DOI lookup, then a one-row
       CrossRef text search.
    4. Format the result, or re-number the original text when unmatched.

    Args:
        ref_items: list of reader items where type == 'reference'; each
            item must provide its raw text under the 'text' key.
        ris_data: optional list of RIS records from ris_parser.parse_ris()
        use_crossref: whether to use CrossRef API for unmatched references
        progress_callback: optional callable(current, total, message) for
            GUI.  Called before each CrossRef lookup and once after every
            item has been processed.

    Returns:
        list of ResolvedReference objects, one per input item, in input
        order.
    """
    results = []
    total = len(ref_items)

    for i, item in enumerate(ref_items):
        text = item['text']
        # Use the number written in the text when there is one; otherwise
        # (or when that number is 0, which is falsy) use the 1-based position.
        ref_num = extract_ref_number(text) or (i + 1)

        resolved = ResolvedReference(
            index=ref_num,
            original_text=text,
        )

        # Step 1: Extract DOIs from reference text
        dois = extract_dois(text)
        if dois:
            resolved.doi = dois[0]

        # Step 2: Match against RIS data (if provided)
        if ris_data:
            # Try DOI match first (most reliable)
            if resolved.doi:
                wanted_doi = resolved.doi.lower()  # invariant across records
                for rec in ris_data:
                    if rec.get('doi') and rec['doi'].lower() == wanted_doi:
                        resolved.metadata = rec
                        resolved.source = 'ris'
                        break

            # Fall back to fuzzy text matching
            if not resolved.metadata:
                match = match_citation_to_ris(text, ris_data)
                if match:
                    resolved.metadata = match
                    resolved.source = 'ris'

        # Step 3: CrossRef lookup (if enabled and still unmatched)
        if use_crossref and not resolved.metadata:
            if progress_callback:
                progress_callback(i + 1, total, f'Looking up reference {ref_num}...')

            # Try DOI lookup first
            if resolved.doi:
                cr_ref = lookup_doi(resolved.doi)
                if cr_ref:
                    resolved.metadata = cr_ref
                    resolved.source = 'crossref_doi'

            # Fall back to text search
            if not resolved.metadata:
                cr_results = search_reference(text, rows=1)
                if cr_results:
                    resolved.metadata = cr_results[0]
                    resolved.source = 'crossref_search'
                    # Only adopt the search result's DOI when the text itself
                    # did not contain one.
                    if not resolved.doi and resolved.metadata.get('doi'):
                        resolved.doi = resolved.metadata['doi']

        # Step 4: Format the reference
        if resolved.metadata:
            resolved.formatted_runs = format_reference_mdpi_runs(
                resolved.metadata, resolved.index)
        else:
            # Unmatched — preserve original as-is but fix numbering to N.\t format.
            # The pattern must also consume the optional dot after the number
            # ('1.' / '[1].'); otherwise '1. Smith' would become '1.\t. Smith'.
            clean_text = re.sub(r'^\[?\d+\]?\.?\s*', '', text)
            resolved.formatted_runs = [
                {'text': f'{resolved.index}.\t{clean_text}',
                 'bold': False, 'italic': False,
                 'superscript': False, 'subscript': False}
            ]

        results.append(resolved)

        if progress_callback:
            progress_callback(i + 1, total, f'Resolved {i + 1}/{total} references')

    return results


def get_resolution_stats(results):
    """Summarise how a list of resolved references was matched.

    Returns a dict holding 'total' (number of results), one counter per
    source ('ris', 'crossref_doi', 'crossref_search', 'unmatched') and
    'unmatched_indices', the ``index`` of every unmatched result in input
    order.  A result whose ``source`` is not one of the dict's keys raises
    KeyError.
    """
    source_keys = ('ris', 'crossref_doi', 'crossref_search', 'unmatched')

    stats = {'total': len(results)}
    stats.update(dict.fromkeys(source_keys, 0))
    stats['unmatched_indices'] = []

    for ref in results:
        origin = ref.source
        stats[origin] += 1
        if origin == 'unmatched':
            stats['unmatched_indices'].append(ref.index)

    return stats