diff --git a/src/somef/header_analysis.py b/src/somef/header_analysis.py
index 1545fdbb..809f1fa5 100644
--- a/src/somef/header_analysis.py
+++ b/src/somef/header_analysis.py
@@ -109,7 +109,9 @@ def extract_header_content(text):
df = pd.concat(dfs, ignore_index=True)
# for i, j in zip(header, content):
# df = df.append({'Header': i, 'Content': j, 'ParentHeader': parent_headers[i]}, ignore_index=True)
- df['Content'].replace('', np.nan, inplace=True)
+ # df['Content'].replace('', np.nan, inplace=True)
+ df['Content'] = df['Content'].replace('', np.nan)
+
df.dropna(subset=['Content'], inplace=True)
return df, none_header_content
@@ -221,13 +223,15 @@ def extract_categories(repo_data, repository_metadata: Result):
data.at[i, 'Group'] = data['GroupParent'][i]
data = data.drop(columns=['GroupParent'])
if len(data['Group'].iloc[0]) == 0:
- data['Group'].iloc[0] = ['unknown']
+ # data['Group'].iloc[0] = ['unknown']
+ data.loc[0, 'Group'] = ['unknown']
groups = data.apply(lambda x: pd.Series(x['Group']), axis=1).stack().reset_index(level=1, drop=True)
groups.name = 'Group'
data = data.drop('Group', axis=1).join(groups)
if data['Group'].iloc[0] == 'unknown':
- data['Group'].iloc[0] = np.NaN
+ # data['Group'].iloc[0] = np.NaN
+ data.loc[0, 'Group'] = np.nan
# to json
group = data.loc[(data['Group'] != 'None') & pd.notna(data['Group'])]
diff --git a/src/somef/parser/bower_parser.py b/src/somef/parser/bower_parser.py
index f67666af..14793886 100644
--- a/src/somef/parser/bower_parser.py
+++ b/src/somef/parser/bower_parser.py
@@ -24,7 +24,8 @@ def parse_bower_json_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
- "value": "bower.json",
+ # "value": "bower.json",
+ "value": source,
"type": constants.URL,
},
1,
diff --git a/src/somef/parser/cabal_parser.py b/src/somef/parser/cabal_parser.py
index 9bc62662..74ed7bd2 100644
--- a/src/somef/parser/cabal_parser.py
+++ b/src/somef/parser/cabal_parser.py
@@ -24,7 +24,8 @@ def parse_cabal_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
- "value": Path(file_path).name,
+ # "value": Path(file_path).name,
+ "value": source,
"type": constants.URL,
},
1,
diff --git a/src/somef/parser/composer_parser.py b/src/somef/parser/composer_parser.py
index 664c9db2..d1b07946 100644
--- a/src/somef/parser/composer_parser.py
+++ b/src/somef/parser/composer_parser.py
@@ -25,7 +25,8 @@ def parse_composer_json(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
- "value": "composer.json",
+ # "value": "composer.json",
+ "value": source,
"type": constants.URL,
},
1,
diff --git a/src/somef/parser/description_parser.py b/src/somef/parser/description_parser.py
index b3604340..0da8f8fd 100644
--- a/src/somef/parser/description_parser.py
+++ b/src/somef/parser/description_parser.py
@@ -24,7 +24,8 @@ def parse_description_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
- "value": "DESCRIPTION",
+ # "value": "DESCRIPTION",
+ "value": source,
"type": constants.URL,
"source": source
},
diff --git a/src/somef/parser/gemspec_parser.py b/src/somef/parser/gemspec_parser.py
index 0df763b5..6714b012 100644
--- a/src/somef/parser/gemspec_parser.py
+++ b/src/somef/parser/gemspec_parser.py
@@ -27,7 +27,8 @@ def parse_gemspec_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
- "value": Path(file_path).name,
+ # "value": Path(file_path).name,
+ "value": source,
"type": constants.URL,
},
1,
diff --git a/src/somef/parser/package_json_parser.py b/src/somef/parser/package_json_parser.py
index 8247645b..a0884c5e 100644
--- a/src/somef/parser/package_json_parser.py
+++ b/src/somef/parser/package_json_parser.py
@@ -188,7 +188,8 @@ def parse_package_json_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
- "value": "package.json",
+ # "value": "package.json",
+ "value": source,
"type": constants.URL,
},
1,
diff --git a/src/somef/parser/pom_xml_parser.py b/src/somef/parser/pom_xml_parser.py
index ddedbdae..dfb9c1cc 100644
--- a/src/somef/parser/pom_xml_parser.py
+++ b/src/somef/parser/pom_xml_parser.py
@@ -233,7 +233,8 @@ def parse_pom_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
- "value": "pom.xml",
+ # "value": "pom.xml",
+ "value": source,
"type": constants.URL
},
1,
diff --git a/src/somef/parser/toml_parser.py b/src/somef/parser/toml_parser.py
index 202c0cc9..e87269d0 100644
--- a/src/somef/parser/toml_parser.py
+++ b/src/somef/parser/toml_parser.py
@@ -45,7 +45,8 @@ def parse_toml_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
- "value": display_name,
+ # "value": display_name,
+ "value": source,
"type": constants.URL,
},
1,
diff --git a/src/somef/regular_expressions.py b/src/somef/regular_expressions.py
index bfe5dc00..7744d240 100644
--- a/src/somef/regular_expressions.py
+++ b/src/somef/regular_expressions.py
@@ -689,6 +689,7 @@ def extract_readthedocs_badgeds(readme_text, repository_metadata: Result, source
# RST
for match in re.findall(constants.REGEXP_READTHEDOCS_RST, readme_text):
+ print(match)
if isinstance(match, tuple):
urls.update([u for u in match if u])
elif match:
@@ -702,15 +703,21 @@ def extract_readthedocs_badgeds(readme_text, repository_metadata: Result, source
urls.add(match)
# HTML
- for match in re.findall(constants.REGEXP_READTHEDOCS_HTML, readme_text):
+ pattern_html = re.compile(constants.REGEXP_READTHEDOCS_HTML, flags=re.DOTALL | re.IGNORECASE)
+ for match in pattern_html.findall(readme_text):
if isinstance(match, tuple):
urls.update([u for u in match if u])
elif match:
urls.add(match)
for url in urls:
+ if "pypi.org/project" in url:
+ category = constants.CAT_PACKAGE_DISTRIBUTION
+ else:
+ category = constants.CAT_DOCUMENTATION
+
repository_metadata.add_result(
- constants.CAT_DOCUMENTATION,
+ category,
{
constants.PROP_TYPE: constants.URL,
constants.PROP_VALUE: url
@@ -722,28 +729,28 @@ def extract_readthedocs_badgeds(readme_text, repository_metadata: Result, source
return repository_metadata
-def extract_package_manager_badgeds(readme_text, repository_metadata: Result, source) -> Result:
- """
- Function that takes the text of a readme file and searches if there are package manager badges.
- Parameters
- ----------
- @param readme_text: Text of the readme
- @param repository_metadata: Result with all the findings in the repo
- @param source: source file on top of which the extraction is performed (provenance)
- Returns
- -------
- @returns Result with the package badges found
- """
- package_manager_badges = re.findall(constants.REGEXP_READTHEDOCS_BADGES, readme_text, re.DOTALL)
- for package in package_manager_badges:
- repository_metadata.add_result(constants.CAT_DOCUMENTATION,
- {
- constants.PROP_TYPE: constants.URL,
- constants.PROP_VALUE: package
- }, 1, constants.TECHNIQUE_REGULAR_EXPRESSION, source)
+# def extract_package_manager_badgeds(readme_text, repository_metadata: Result, source) -> Result:
+# """
+# Function that takes the text of a readme file and searches if there are package manager badges.
+# Parameters
+# ----------
+# @param readme_text: Text of the readme
+# @param repository_metadata: Result with all the findings in the repo
+# @param source: source file on top of which the extraction is performed (provenance)
+# Returns
+# -------
+# @returns Result with the package badges found
+# """
+# package_manager_badges = re.findall(constants.REGEXP_READTHEDOCS_BADGES, readme_text, re.DOTALL)
+# for package in package_manager_badges:
+# repository_metadata.add_result(constants.CAT_DOCUMENTATION,
+# {
+# constants.PROP_TYPE: constants.URL,
+# constants.PROP_VALUE: package
+# }, 1, constants.TECHNIQUE_REGULAR_EXPRESSION, source)
- return repository_metadata
+# return repository_metadata
def extract_swh_badges(readme_text, repository_metadata: Result, source) -> Result:
diff --git a/src/somef/test/test_bower_parser.py b/src/somef/test/test_bower_parser.py
index bb51f551..1d07c200 100644
--- a/src/somef/test/test_bower_parser.py
+++ b/src/somef/test/test_bower_parser.py
@@ -19,7 +19,8 @@ def test_parse_bower_json(self):
metadata_result = parse_bower_json_file(bower_file_path, result, "https://example.org/bower.json")
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "bower.json")
+ # self.assertEqual(package_results[0]["result"]["value"], "bower.json")
+ self.assertEqual(package_results[0]["result"]["value"], "https://example.org/bower.json")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
name_results = metadata_result.results.get(constants.CAT_NAME, [])
@@ -73,7 +74,8 @@ def test_parse_2_bower_json(self):
metadata_result = parse_bower_json_file(bower_file_path, result, "https://example.org/bower.json")
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "bower.json")
+ # self.assertEqual(package_results[0]["result"]["value"], "bower.json")
+ self.assertEqual(package_results[0]["result"]["value"], "https://example.org/bower.json")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
name_results = metadata_result.results.get(constants.CAT_NAME, [])
diff --git a/src/somef/test/test_cabal_parser.py b/src/somef/test/test_cabal_parser.py
index c6259c99..2996a400 100644
--- a/src/somef/test/test_cabal_parser.py
+++ b/src/somef/test/test_cabal_parser.py
@@ -19,7 +19,8 @@ def test_parse_cabal(self):
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "unused.cabal")
+ # self.assertEqual(package_results[0]["result"]["value"], "unused.cabal")
+ self.assertEqual(package_results[0]["result"]["value"], "https://example.org/unused.cabal")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
@@ -60,7 +61,8 @@ def test_parse_2_cabal(self):
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "cabal.cabal")
+ # self.assertEqual(package_results[0]["result"]["value"], "cabal.cabal")
+ self.assertEqual(package_results[0]["result"]["value"], "https://example.org/cabal.cabal")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
description_results = metadata_result.results.get(constants.CAT_DESCRIPTION, [])
self.assertTrue(len(description_results) > 0, "No description found")
diff --git a/src/somef/test/test_description_parser.py b/src/somef/test/test_description_parser.py
index b613c71c..1460dc20 100644
--- a/src/somef/test/test_description_parser.py
+++ b/src/somef/test/test_description_parser.py
@@ -28,7 +28,8 @@ def test_description(self):
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "DESCRIPTION")
+ # self.assertEqual(package_results[0]["result"]["value"], "DESCRIPTION")
+ self.assertEqual(package_results[0]["result"]["value"], "https://example.org/DESCRIPTION")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
@@ -59,7 +60,7 @@ def test_description_2(self):
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "DESCRIPTION")
+ self.assertEqual(package_results[0]["result"]["value"], "https://example.org/DESCRIPTION")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
diff --git a/src/somef/test/test_gemspec_parser.py b/src/somef/test/test_gemspec_parser.py
index afa07545..5c7533fe 100644
--- a/src/somef/test/test_gemspec_parser.py
+++ b/src/somef/test/test_gemspec_parser.py
@@ -22,7 +22,8 @@ def test_parse_gemspec(self):
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "bootstrap-datepicker-rails.gemspec")
+ # self.assertEqual(package_results[0]["result"]["value"], "bootstrap-datepicker-rails.gemspec")
+ self.assertEqual(package_results[0]["result"]["value"], "https://example.org/bootstrap-datepicker-rails.gemspec")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
diff --git a/src/somef/test/test_package_json_parser.py b/src/somef/test/test_package_json_parser.py
index 9e91dd6c..066e0594 100644
--- a/src/somef/test/test_package_json_parser.py
+++ b/src/somef/test/test_package_json_parser.py
@@ -58,7 +58,8 @@ def test_parse_package_json_file(self):
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "package.json")
+ # self.assertEqual(package_results[0]["result"]["value"], "package.json")
+ self.assertEqual(package_results[0]["result"]["value"], "http://example.com/package_neors.json")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
keywords_results = metadata_result.results.get(constants.CAT_KEYWORDS, [])
diff --git a/src/somef/test/test_pom_parser.py b/src/somef/test/test_pom_parser.py
index 6a28ffb2..f51c38fd 100644
--- a/src/somef/test/test_pom_parser.py
+++ b/src/somef/test/test_pom_parser.py
@@ -34,7 +34,8 @@ def test_parse_pom_file(self):
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "pom.xml")
+ # self.assertEqual(package_results[0]["result"]["value"], "pom.xml")
+ self.assertEqual(package_results[0]["result"]["value"], "https://example.org/pom.xml")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
requirements_results = metadata_result.results.get(constants.CAT_REQUIREMENTS, [])
diff --git a/src/somef/test/test_regular_expressions.py b/src/somef/test/test_regular_expressions.py
index 76f10a7d..ea801af4 100644
--- a/src/somef/test/test_regular_expressions.py
+++ b/src/somef/test/test_regular_expressions.py
@@ -446,6 +446,7 @@ def test_issue_771(self):
def test_issue_860(self):
"""Test designed to check redthedocs links are extracted correctly when multiple links are present """
documentation_values = []
+ package_values = []
for readme_file in ["README-menty.md", "README-uncbiag.md"]:
with open(test_data_path + readme_file, "r") as data_file:
@@ -461,14 +462,27 @@ def test_issue_860(self):
if isinstance(value, str):
documentation_values.append(value)
+ if "package_distribution" in documentation.results:
+ for result in documentation.results["package_distribution"]:
+ value = result.get("result", {}).get("value")
+ if isinstance(value, str):
+ package_values.append(value)
+
expected_doc_urls = {
- "https://pypi.org/project/mentpy",
+ "https://docs.mentpy.com/en/latest/?badge=latest",
"https://icon.readthedocs.io/en/master/"
}
-
+
+ expected_package_urls = {
+ "https://pypi.org/project/mentpy"
+ }
+
assert expected_doc_urls.issubset(set(documentation_values)), (
f"Expected documentation URLs {expected_doc_urls} not found in {documentation_values}"
)
+ assert expected_package_urls.issubset(set(package_values)), (
+ f"Pypy package {expected_package_urls} not foun in package_distribution: {package_values}"
+ )
def test_readme_rst_readthedocs(self):
"""Test designed to check whether rst readmes get stuck in extracting documentation """
diff --git a/src/somef/test/test_toml_parser.py b/src/somef/test/test_toml_parser.py
index b50dd4a6..a1653b43 100644
--- a/src/somef/test/test_toml_parser.py
+++ b/src/somef/test/test_toml_parser.py
@@ -56,11 +56,15 @@ def test_parse_cargo_toml(self):
)
result = Result()
- parse_toml_file(self.cargo_toml_path, result, "test")
+ # parse_toml_file(self.cargo_toml_path, result, "test")
+ print("self.cargo_toml_path:", self.cargo_toml_path)
+ parse_toml_file(self.cargo_toml_path, result, "http://example.com/rustdesk/Cargo.toml")
self.assertIn(constants.CAT_HAS_PACKAGE_FILE, result.results)
package_file = result.results[constants.CAT_HAS_PACKAGE_FILE][0]["result"]["value"]
- self.assertEqual(package_file, "Cargo.toml")
+
+ # self.assertEqual(package_file, "Cargo.toml")
+ self.assertEqual(package_file, "http://example.com/rustdesk/Cargo.toml")
self.assertIn(constants.CAT_PACKAGE_ID, result.results)
package_id = result.results[constants.CAT_PACKAGE_ID][0]["result"]["value"]
@@ -102,7 +106,8 @@ def test_parse_pluto_project_toml(self):
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "Project.toml")
+ # self.assertEqual(package_results[0]["result"]["value"], "Project.toml")
+ self.assertEqual(package_results[0]["result"]["value"], "http://example.com/repo1/Project.toml")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
self.assertEqual(package_results[0]["technique"], constants.TECHNIQUE_CODE_CONFIG_PARSER)
@@ -168,7 +173,8 @@ def test_parse_flux_project_toml(self):
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "Project.toml")
+ # self.assertEqual(package_results[0]["result"]["value"], "Project.toml")
+ self.assertEqual(package_results[0]["result"]["value"], "http://example.com/repo2/Project.toml")
package_id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
self.assertTrue(len(package_id_results) > 0, "No package ID found")
diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py
index 1775da1a..e4052d9c 100644
--- a/src/somef/utils/constants.py
+++ b/src/somef/utils/constants.py
@@ -54,12 +54,12 @@
REGEXP_READTHEDOCS_MD = (
r"\(\s*(https?://[^\s\)]+\.readthedocs\.io[^\s\)]*)\s*\)"
)
-# REGEXP_READTHEDOCS_HTML = (
-# r"<a[^>]+href=['\"](https?://[^\s\"']+\.readthedocs\.io[^\s\"']*)['\"][^>]*>"
-# r"(?:\s|<[^>]+>)*"
-# r"<img[^>]+src=['\"]https?://readthedocs\.org/projects/[^\s/]+/badge/[^\s\"']*"
-# )
-REGEXP_READTHEDOCS_HTML = ( r"<a[^>]+href=['\"](https?://[^\s\"']+)['\"][^>]*>" r"(?:\s|<[^>]+>)*" r"<img[^>]+src=['\"]https?://readthedocs\.org/projects/[^\s/]+/badge/[^\s\"']*" )
+
+REGEXP_READTHEDOCS_HTML = (
+ r"<a[^>]+?href=['\"](https?://[^'\"\s]+?)['\"][^>]*?>"
+ r"(?:(?!</a>)[\s\S])*?"
+ r"<img[^>]+?src=['\"]https?://(?:readthedocs\.org/projects/|img\.shields\.io/pypi/)[^'\"\s]*"
+)
# For natural language citation
REGEXP_DOI_NATURAL = r'10\.\d{4,9}/[-._;()/:A-Za-z0-9]+'
REGEXP_YEAR_NATURAL = r'\b(19|20)\d{2}\b'