diff --git a/main.py b/main.py
index 270ba26..bcb94ba 100644
--- a/main.py
+++ b/main.py
@@ -1,5 +1,4 @@
 from scraping.parse import Scraper
-from scraping.quartile import quartile_of
 from scraping.research_gate import members_of_Inno
 
 if __name__ == '__main__':
diff --git a/scraping/parse.py b/scraping/parse.py
index 2b89ce8..11b9c75 100644
--- a/scraping/parse.py
+++ b/scraping/parse.py
@@ -11,6 +11,9 @@ def __init__(self):
             key = file.readline()
         self.client = ElsClient(key)
         self.client.__min_req_interval = 1 # Set request interval. Reduce for minor speed up at the cost of stability.
+        # sheet_name=None loads every sheet into a dict keyed by sheet name.
+        # Stored on self so quartile_of() can reach the workbook later.
+        self.dataframe = pd.read_excel('quartiles.xlsx', sheet_name=None)
 
     def parse(self, preload=True):
         print('Scraper', id(self), 'began parsing')
@@ -135,3 +138,28 @@ def _get_papers(self):
         papers.to_csv('data/papers.csv', index=False)
         return papers
 
+    def quartile_of(self, a: int, year: str):
+        """Return the smallest quartile listed for ``a`` in a given year.
+
+        Looks ``a`` up in the first column of the ``"CiteScore <year>"`` sheet
+        and takes the minimum of the ``"Quartile"`` column over every row
+        whose first column equals ``a``.
+
+        NOTE(review): the lookup is a binary search, so it assumes the first
+        column of each sheet is sorted ascending -- confirm against the
+        workbook.
+
+        :param a: value to look up in the first column of the sheet.
+        :param year: suffix appended to ``"CiteScore "`` to form the sheet name.
+        :return: smallest ``"Quartile"`` value among the matching rows, or
+            ``-1`` when no row matches.
+        :raises KeyError: if the workbook has no sheet for ``year``.
+        """
+        df = self.dataframe["CiteScore " + year]
+        keys = df.iloc[:, 0]
+        # Half-open range [first, last) of the rows whose key equals ``a``.
+        first = keys.searchsorted(a, side="left")
+        last = keys.searchsorted(a, side="right")
+        if first == last:
+            return -1
+        return df["Quartile"].iloc[first:last].min()
diff --git a/scraping/quartile.py b/scraping/quartile.py
deleted file mode 100644
index ec5fe33..0000000
--- a/scraping/quartile.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import pandas as pd
-
-
-def quartile_of(a: int):
-    df = pd.read_excel('quartiles2020.xlsx', sheet_name=0)
-    low = 0
-    high = df.shape[0] - 1
-    h = high
-    while low <= high:
-        mid = (high + low) // 2
-        if df.iloc[mid, 0] < a:
-            low = mid + 1
-        elif df.iloc[mid, 0] > a:
-            high = mid - 1
-        else:
-            low, high = mid, mid
-            while df.iloc[low, 0] == mid and low - 1 != -1:
-                low -= 1
-            while df.iloc[high, 0] == mid and high + 1 != h + 1:
-                high += 1
-            high += 1
-            return df.loc[low:high, "Quartile"].min()
-    return -1
diff --git a/scraping/quartiles.xlsx b/scraping/quartiles.xlsx
new file mode 100644
index 0000000..b8d056a
Binary files /dev/null and b/scraping/quartiles.xlsx differ
diff --git a/scraping/quartiles2020.xlsx b/scraping/quartiles2020.xlsx
deleted file mode 100644
index 557226f..0000000
Binary files a/scraping/quartiles2020.xlsx and /dev/null differ