From f4138defdf45cfa980ac3b07b3f6dfe40459236e Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Wed, 17 Dec 2025 13:47:34 +0100 Subject: [PATCH 1/3] Solve empty repos and problematic repo when dividing the text for classification. Fixes #859, Fixes #891, Fixes #862 --- src/somef/process_repository.py | 13 ++++++-- src/somef/test/test_JSON_export.py | 50 +++++++++++++++--------------- src/somef/utils/constants.py | 4 ++- 3 files changed, 38 insertions(+), 29 deletions(-) diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index 4c304af6..2258e008 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -774,10 +774,17 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization): with open(repo_zip_file, "wb") as f: f.write(repo_zip) - with zipfile.ZipFile(repo_zip_file, "r") as zip_ref: - zip_ref.extractall(repo_extract_dir) - + try: + with zipfile.ZipFile(repo_zip_file, "r") as zip_ref: + zip_ref.extractall(repo_extract_dir) + except zipfile.BadZipFile: + logging.error("Downloaded archive is not a valid zip (repo may be empty)") + return None + repo_folders = os.listdir(repo_extract_dir) + if not repo_folders: + logging.warning("Repository archive is empty") + return None repo_dir = os.path.join(repo_extract_dir, repo_folders[0]) return repo_dir diff --git a/src/somef/test/test_JSON_export.py b/src/somef/test/test_JSON_export.py index f92368a3..d1760e10 100644 --- a/src/somef/test/test_JSON_export.py +++ b/src/somef/test/test_JSON_export.py @@ -407,36 +407,36 @@ def test_issue_830(self): # except Exception as e: # print(f"Failed to delete {cls.json_file}: {e}") - # def test_issue_862(self): - # """Checks if this repository does not gets stuck when labeling headers""" - # somef_cli.run_cli(threshold=0.8, - # ignore_classifiers=False, - # repo_url=None, - # local_repo=test_data_repositories + "componentInstaller", - # doc_src=None, - # in_file=None, - # output=test_data_path + "test_issue_862.json", - # graph_out=None, - # graph_format="turtle", - # codemeta_out=None, - # pretty=True, - # missing=False, - # readme_only=False) + def test_issue_862(self): + """Checks if this repository does not gets stuck when labeling headers""" + somef_cli.run_cli(threshold=0.8, + ignore_classifiers=False, + repo_url=None, + local_repo=test_data_repositories + "componentInstaller", + doc_src=None, + in_file=None, + output=test_data_path + "test_issue_862.json", + graph_out=None, + graph_format="turtle", + codemeta_out=None, + pretty=True, + missing=False, + readme_only=False) - # text_file = open(test_data_path + "test_issue_862.json", "r") - # data = text_file.read() - # text_file.close() - # json_content = json.loads(data) + text_file = open(test_data_path + "test_issue_862.json", "r") + data = text_file.read() + text_file.close() + json_content = json.loads(data) - # assert "description" in json_content, "Missing 'description' property" + assert "description" in json_content, "Missing 'description' property" - # assert len(json_content["description"]) > 0, "Description list is empty" + assert len(json_content["description"]) > 0, "Description list is empty" - # first_desc = json_content["description"][0]["result"] - # assert "value" in first_desc, "Missing 'value' in description result" - # assert first_desc["value"], "Description 'value' is empty" + first_desc = json_content["description"][0]["result"] + assert "value" in first_desc, "Missing 'value' in description result" + assert first_desc["value"], "Description 'value' is empty" - # os.remove(test_data_path + "test_issue_862.json") + os.remove(test_data_path + "test_issue_862.json") if __name__ == '__main__': unittest.main() diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index ea084231..93be2d05 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -24,7 +24,9 @@ REGEXP_PYPI_2 = "[![Latest PyPI version]" REGEXP_COLAB = "https://colab.research.google.com/drive" # needed to cleanup bibtext files. -REGEXP_BIBTEX = r'\@[a-zA-Z]+\{[.\n\S\s]+?[author|title][.\n\S\s]+?[author|title][.\n\S\s]+?\n\}' +# REGEXP_BIBTEX = r'\@[a-zA-Z]+\{[.\n\S\s]+?[author|title][.\n\S\s]+?[author|title][.\n\S\s]+?\n\}' +REGEXP_BIBTEX = r'@[a-zA-Z]+\{[\s\S]*?(author|title)[\s\S]*?(author|title)[\s\S]*?\}' + REGEXP_DOI = r'\[\!\[DOI\]([^\]]+)\]\(([^)]+)\)' REGEXP_LINKS = r"\[(.*?)?\]\(([^)]+)\)" REGEXP_IMAGES = r"!\[(.*?)?\]\((.*?)?\)" From d34c0b4c5890af31fff2243df716b82a0052c01d Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Wed, 17 Dec 2025 16:17:27 +0100 Subject: [PATCH 2/3] test empty repo. --- src/somef/test/test_JSON_export.py | 84 +++++++++++++++++++++--------- src/somef/utils/constants.py | 2 +- 2 files changed, 59 insertions(+), 27 deletions(-) diff --git a/src/somef/test/test_JSON_export.py b/src/somef/test/test_JSON_export.py index d1760e10..33ad9aa7 100644 --- a/src/somef/test/test_JSON_export.py +++ b/src/somef/test/test_JSON_export.py @@ -407,36 +407,68 @@ def test_issue_830(self): # except Exception as e: # print(f"Failed to delete {cls.json_file}: {e}") - def test_issue_862(self): - """Checks if this repository does not gets stuck when labeling headers""" - somef_cli.run_cli(threshold=0.8, - ignore_classifiers=False, - repo_url=None, - local_repo=test_data_repositories + "componentInstaller", - doc_src=None, - in_file=None, - output=test_data_path + "test_issue_862.json", - graph_out=None, - graph_format="turtle", - codemeta_out=None, - pretty=True, - missing=False, - readme_only=False) + # def test_issue_862(self): + # """Checks if this repository does not gets stuck when labeling headers""" + # somef_cli.run_cli(threshold=0.8, + # ignore_classifiers=False, + # repo_url=None, + # local_repo=test_data_repositories + "componentInstaller", + # doc_src=None, + # in_file=None, + # output=test_data_path + "test_issue_862.json", + # graph_out=None, + # graph_format="turtle", + # codemeta_out=None, + # pretty=True, + # missing=False, + # readme_only=False) - text_file = open(test_data_path + "test_issue_862.json", "r") - data = text_file.read() - text_file.close() - json_content = json.loads(data) - - assert "description" in json_content, "Missing 'description' property" + # text_file = open(test_data_path + "test_issue_862.json", "r") + # data = text_file.read() + # text_file.close() + # json_content = json.loads(data) + + # assert "description" in json_content, "Missing 'description' property" - assert len(json_content["description"]) > 0, "Description list is empty" + # assert len(json_content["description"]) > 0, "Description list is empty" - first_desc = json_content["description"][0]["result"] - assert "value" in first_desc, "Missing 'value' in description result" - assert first_desc["value"], "Description 'value' is empty" + # first_desc = json_content["description"][0]["result"] + # assert "value" in first_desc, "Missing 'value' in description result" + # assert first_desc["value"], "Description 'value' is empty" - os.remove(test_data_path + "test_issue_862.json") + # os.remove(test_data_path + "test_issue_862.json") + + def test_issue_859(self): + """Checks whether a repository without content works fine. Must have just some results from the API.""" + + somef_cli.run_cli(threshold=0.8, + ignore_classifiers=False, + repo_url="https://github.com/shiningZZ/GU-CAFF", + local_repo=None, + doc_src=None, + in_file=None, + output=test_data_path + "test-859.json", + graph_out=None, + graph_format="turtle", + codemeta_out= None, + pretty=True, + missing=False, + readme_only=False) + + with open(test_data_path + "test-859.json", "r") as text_file: + json_content = json.load(text_file) + + assert "code_repository" in json_content + assert len(json_content["code_repository"]) > 0 + + for key, entries in json_content.items(): + if isinstance(entries, list): + for entry in entries: + assert entry.get("technique") == "GitHub_API", \ + f"Unexpected technique {entry.get('technique')} in key {key}" + + os.remove(test_data_path + "test-859.json") + if __name__ == '__main__': unittest.main() diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index 93be2d05..cbf3f85d 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -25,7 +25,7 @@ REGEXP_COLAB = "https://colab.research.google.com/drive" # needed to cleanup bibtext files. # REGEXP_BIBTEX = r'\@[a-zA-Z]+\{[.\n\S\s]+?[author|title][.\n\S\s]+?[author|title][.\n\S\s]+?\n\}' -REGEXP_BIBTEX = r'@[a-zA-Z]+\{[\s\S]*?(author|title)[\s\S]*?(author|title)[\s\S]*?\}' +REGEXP_BIBTEX = r'@[a-zA-Z]+\{[\s\S]*?(?:author|title)[\s\S]*?(?:author|title)[\s\S]*?\}' REGEXP_DOI = r'\[\!\[DOI\]([^\]]+)\]\(([^)]+)\)' REGEXP_LINKS = r"\[(.*?)?\]\(([^)]+)\)" From fa749fed9f67e81296e4093110bc7f30e3bfd033 Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Wed, 17 Dec 2025 16:45:04 +0100 Subject: [PATCH 3/3] bibtext regexp ambiguous --- src/somef/utils/constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index cbf3f85d..1775da1a 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -25,8 +25,8 @@ REGEXP_COLAB = "https://colab.research.google.com/drive" # needed to cleanup bibtext files. # REGEXP_BIBTEX = r'\@[a-zA-Z]+\{[.\n\S\s]+?[author|title][.\n\S\s]+?[author|title][.\n\S\s]+?\n\}' -REGEXP_BIBTEX = r'@[a-zA-Z]+\{[\s\S]*?(?:author|title)[\s\S]*?(?:author|title)[\s\S]*?\}' - +# REGEXP_BIBTEX = r'@[a-zA-Z]+\{[\s\S]*?(?:author|title)[\s\S]*?(?:author|title)[\s\S]*?\}' +REGEXP_BIBTEX = r'@[a-zA-Z]+\{(?=[\s\S]*\bauthor\b)(?=[\s\S]*\btitle\b)[\s\S]*?\}' REGEXP_DOI = r'\[\!\[DOI\]([^\]]+)\]\(([^)]+)\)' REGEXP_LINKS = r"\[(.*?)?\]\(([^)]+)\)" REGEXP_IMAGES = r"!\[(.*?)?\]\((.*?)?\)"