diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py
index 4c304af6..2258e008 100644
--- a/src/somef/process_repository.py
+++ b/src/somef/process_repository.py
@@ -774,10 +774,17 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization):
     with open(repo_zip_file, "wb") as f:
         f.write(repo_zip)
 
-    with zipfile.ZipFile(repo_zip_file, "r") as zip_ref:
-        zip_ref.extractall(repo_extract_dir)
-
+    try:
+        with zipfile.ZipFile(repo_zip_file, "r") as zip_ref:
+            zip_ref.extractall(repo_extract_dir)
+    except zipfile.BadZipFile:
+        logging.error("Downloaded archive is not a valid zip (repo may be empty)")
+        return None
+
     repo_folders = os.listdir(repo_extract_dir)
+    if not repo_folders:
+        logging.warning("Repository archive is empty")
+        return None
     repo_dir = os.path.join(repo_extract_dir, repo_folders[0])
 
     return repo_dir
diff --git a/src/somef/test/test_JSON_export.py b/src/somef/test/test_JSON_export.py
--- a/src/somef/test/test_JSON_export.py
+++ b/src/somef/test/test_JSON_export.py
@@ -438,5 +438,43 @@ def test_issue_830(self):
         # os.remove(test_data_path + "test_issue_862.json")
 
 
+    def test_issue_859(self):
+        """Check that a repository without content is processed without errors.
+
+        Every list entry in the output must come from the GitHub API technique.
+        """
+        output_path = test_data_path + "test-859.json"
+        try:
+            somef_cli.run_cli(threshold=0.8,
+                              ignore_classifiers=False,
+                              repo_url="https://github.com/shiningZZ/GU-CAFF",
+                              local_repo=None,
+                              doc_src=None,
+                              in_file=None,
+                              output=output_path,
+                              graph_out=None,
+                              graph_format="turtle",
+                              codemeta_out=None,
+                              pretty=True,
+                              missing=False,
+                              readme_only=False)
+
+            with open(output_path, "r") as text_file:
+                json_content = json.load(text_file)
+
+            assert "code_repository" in json_content
+            assert len(json_content["code_repository"]) > 0
+
+            for key, entries in json_content.items():
+                if isinstance(entries, list):
+                    for entry in entries:
+                        assert entry.get("technique") == "GitHub_API", \
+                            f"Unexpected technique {entry.get('technique')} in key {key}"
+        finally:
+            # Remove the generated file even when the run or an assertion fails.
+            if os.path.exists(output_path):
+                os.remove(output_path)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py
--- a/src/somef/utils/constants.py
+++ b/src/somef/utils/constants.py
@@ -24,7 +24,10 @@
 REGEXP_PYPI_2 = "[![Latest PyPI version]"
 REGEXP_COLAB = "https://colab.research.google.com/drive"
 # needed to cleanup bibtext files.
-REGEXP_BIBTEX = r'\@[a-zA-Z]+\{[.\n\S\s]+?[author|title][.\n\S\s]+?[author|title][.\n\S\s]+?\n\}'
+# Both lookaheads stop at the closing "\n}" so that "author" and "title" must
+# appear inside the entry itself; the match then runs up to that closing brace
+# instead of stopping at the first "}" of a nested field value.
+REGEXP_BIBTEX = r'@[a-zA-Z]+\{(?=(?:(?!\n\})[\s\S])*?\bauthor\b)(?=(?:(?!\n\})[\s\S])*?\btitle\b)[\s\S]*?\n\}'
 REGEXP_DOI = r'\[\!\[DOI\]([^\]]+)\]\(([^)]+)\)'
 REGEXP_LINKS = r"\[(.*?)?\]\(([^)]+)\)"
 REGEXP_IMAGES = r"!\[(.*?)?\]\((.*?)?\)"