# GenBankRefs/pubmed_search.py (hivdb/GenBankRefs, branch: main)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from Bio import Entrez
import pandas as pd
import time
from itertools import combinations

# Contact e-mail that Biopython's Entrez module sends to NCBI with requests.
# NOTE(review): this value contains no "@", so it is not a syntactically
# valid e-mail address -- confirm the intended address and correct it.
Entrez.email = "rshafer.stanford.edu"


def search_pubmed(query, db='pubmed', retmax=1):
    """Run an Entrez ``esearch`` and return the matching record IDs.

    Args:
        query: Search term passed to Entrez as ``term``.
        db: Entrez database name to search (defaults to ``'pubmed'``).
        retmax: Maximum number of IDs requested from the service.

    Returns:
        The ``"IdList"`` entry of the parsed esearch response.

    Raises:
        Whatever ``Entrez.esearch`` / ``Entrez.read`` raise on network or
        parsing errors; the response handle is closed in either case.
    """
    # Fixed one-second pause before every request.
    # NOTE(review): presumably a throttle for NCBI's rate limits -- confirm.
    time.sleep(1)
    handle = Entrez.esearch(db=db, term=query, retmax=retmax)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()
    return record["IdList"]


def fetch_pubmed_details(pubmed_ids):
    """Fetch title and author information for a list of PubMed IDs.

    Args:
        pubmed_ids: Iterable of PubMed IDs (strings or anything whose
            ``str()`` form is the ID). Blank entries are ignored.

    Returns:
        A list of dicts, one per ``PubmedArticle`` in the response, with the
        keys ``"PubMed ID"``, ``"Title"`` and ``"Authors"`` (a comma-separated
        string of "LastName ForeName"). Returns an empty list, without
        contacting the service, when no usable ID is supplied.

    Raises:
        Whatever ``Entrez.efetch`` / ``Entrez.read`` raise on network or
        parsing errors; the response handle is closed in either case.
    """
    id_list = [str(pmid).strip() for pmid in pubmed_ids]
    id_list = [pmid for pmid in id_list if pmid]
    if not id_list:
        # Nothing to look up: avoid sending a request with an empty id.
        return []

    handle = Entrez.efetch(
        db="pubmed", id=",".join(id_list),
        rettype="abstract",
        retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()

    articles = []
    for article in records["PubmedArticle"]:
        citation = article["MedlineCitation"]
        details = citation["Article"]
        # Authors lacking either name part (e.g. collective names) are skipped.
        authors = [
            author["LastName"] + " " + author["ForeName"]
            for author in details.get("AuthorList", [])
            if "LastName" in author and "ForeName" in author]
        # The abstract text is deliberately not extracted here.
        articles.append({
            "PubMed ID": citation["PMID"],
            "Title": details["ArticleTitle"],
            "Authors": ", ".join(authors),
        })

    return articles


def _normalize_pmid(value):
    """Return *value* as a clean PMID string; missing values become ''."""
    if value is None:
        return ''
    if not isinstance(value, str) and pd.isna(value):
        return ''
    if isinstance(value, float) and value.is_integer():
        # Numeric cells read back from a spreadsheet arrive as floats
        # (12345.0); drop the artificial ".0" suffix.
        return str(int(value))
    return str(value).strip()


def _ref_key(ref_id):
    """Return a lookup key for a RefID that is stable across int/str forms."""
    try:
        return int(ref_id)
    except (TypeError, ValueError):
        return str(ref_id).strip()


def search_by_pubmed_API(
        virus, genbank, overwrite=False):
    """Search PubMed for references in *genbank* that have no PMID yet.

    If ``virus.pubmed_search_result`` exists and *overwrite* is false, that
    spreadsheet is loaded and returned and nothing else happens.

    Otherwise every row of *genbank* whose ``PMID`` is blank is searched via
    ``search_pubmed`` by title, by authors (the full author string plus every
    pair of authors combined with ``virus.name``) and by accession number.
    Results are written to ``<virus.name>_pubmed_search.xlsx`` in
    ``virus.output_excel_dir`` after every row, so an interrupted run can be
    resumed: rows already present in that file are not searched again.

    Args:
        virus: Object providing ``name``, ``pubmed_search_result`` and
            ``output_excel_dir`` (the latter two are used as path objects).
        genbank: DataFrame with the columns ``RefID``, ``Title``, ``Authors``,
            ``Journal``, ``Year`` and ``accession``; a ``PMID`` column is
            optional and is created when absent. Modified in place.
        overwrite: When true, ignore an existing ``pubmed_search_result``.

    Returns:
        The DataFrame read from ``virus.pubmed_search_result`` when that file
        is reused. In every other case the function does not return.

    Raises:
        SystemExit: Always raised after the search pass, so that the search
            output can be checked by hand before the pipeline continues.
    """
    if virus.pubmed_search_result.exists() and not overwrite:
        return pd.read_excel(virus.pubmed_search_result)

    cache_file = virus.output_excel_dir / f'{virus.name}_pubmed_search.xlsx'

    if 'PMID' not in genbank.columns:
        genbank['PMID'] = ''
    # Normalise to strings up front so blank checks and cache updates below
    # operate on a single, predictable type.
    genbank['PMID'] = genbank['PMID'].map(_normalize_pmid).astype(str)

    answers = []
    cached_refs = set()

    if cache_file.exists():
        cached = pd.read_excel(cache_file)
        answer_map = {
            _ref_key(rec['RefID']): _normalize_pmid(rec['PMID'])
            for _, rec in cached.iterrows()
        }
        cached_refs = set(answer_map)
        for idx, row in genbank.iterrows():
            key = _ref_key(row['RefID'])
            # Only rows that were actually searched before are updated;
            # PMIDs that came with the input for other rows are kept.
            if key in answer_map:
                genbank.at[idx, 'PMID'] = answer_map[key]

        answers = cached.to_dict(orient='records')

    for idx, row in genbank.iterrows():
        if row['PMID'].strip():
            continue
        if _ref_key(row['RefID']) in cached_refs:
            # Already searched in an earlier run (possibly without any hit);
            # searching again would only duplicate the cached entry.
            continue

        title_pmid = []
        title = row['Title'].replace('Direct Submission', '').strip()
        if title:
            title_pmid = search_pubmed(title, retmax=1)

        author_pmid = []
        authors = row['Authors']
        if authors.strip() and authors != 'NCBI':
            author_pmid += search_pubmed(authors, retmax=5)
            names = sorted({
                name.strip()
                for name in authors.split(',')
                if name.strip()
            })
            # One query per author pair: the number of requests grows
            # quadratically with the number of authors.
            for a1, a2 in combinations(names, 2):
                search_term = f'{a1} and {a2} and {virus.name}'
                author_pmid += search_pubmed(search_term, retmax=5)

        accession_pmids = []
        for accession in row['accession'].split(','):
            accession = accession.strip()
            if accession:
                accession_pmids.extend(search_pubmed(accession, retmax=3))

        # Title hits are reported in their own column only; they are not
        # merged into the combined PMID list.
        pmid = sorted(set(author_pmid) | set(accession_pmids))
        pmid_text = ', '.join(str(p) for p in pmid)

        genbank.at[idx, 'PMID'] = pmid_text

        journal = row['Journal']
        answers.append({
            'RefID': row['RefID'],
            'Title': row['Title'],
            'Authors': row['Authors'],
            'Journal': journal if journal.lower() != 'unpublished' else '',
            'Year': row['Year'] if row['Year'] else '',
            'Accession': row['accession'],
            'PMID': pmid_text,
            'PMID_title': ', '.join(str(p) for p in title_pmid),
            'PMID_author': ', '.join(
                str(p) for p in sorted(set(author_pmid))),
            'PMID_acc': ', '.join(
                str(p) for p in sorted(set(accession_pmids))),
        })
        print('pubmed search', row['RefID'])

        # Checkpoint after every row so an interrupted run can resume.
        pd.DataFrame(answers).to_excel(cache_file, index=False)

    print('Please check PubMed Search result by hand.')
    # Equivalent to exit(), but does not depend on the `site` module having
    # injected the interactive `exit` helper into builtins.
    raise SystemExit