10 changes: 7 additions & 3 deletions src/somef/header_analysis.py
@@ -109,7 +109,9 @@ def extract_header_content(text):
df = pd.concat(dfs, ignore_index=True)
# for i, j in zip(header, content):
# df = df.append({'Header': i, 'Content': j, 'ParentHeader': parent_headers[i]}, ignore_index=True)
df['Content'].replace('', np.nan, inplace=True)
# df['Content'].replace('', np.nan, inplace=True)
df['Content'] = df['Content'].replace('', np.nan)

df.dropna(subset=['Content'], inplace=True)
return df, none_header_content

@@ -221,13 +223,15 @@ def extract_categories(repo_data, repository_metadata: Result):
data.at[i, 'Group'] = data['GroupParent'][i]
data = data.drop(columns=['GroupParent'])
if len(data['Group'].iloc[0]) == 0:
data['Group'].iloc[0] = ['unknown']
# data['Group'].iloc[0] = ['unknown']
data.loc[0, 'Group'] = ['unknown']
groups = data.apply(lambda x: pd.Series(x['Group']), axis=1).stack().reset_index(level=1, drop=True)

groups.name = 'Group'
data = data.drop('Group', axis=1).join(groups)
if data['Group'].iloc[0] == 'unknown':
data['Group'].iloc[0] = np.NaN
# data['Group'].iloc[0] = np.NaN
data.loc[0, 'Group'] = np.nan

# to json
group = data.loc[(data['Group'] != 'None') & pd.notna(data['Group'])]
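Both header_analysis.py edits swap a chained assignment (df['Content'].replace(..., inplace=True) and data['Group'].iloc[0] = ...) for a direct reassignment or a label-based .loc write, which avoids pandas' chained-assignment warnings and keeps working once copy-on-write becomes the default. A minimal standalone sketch of the idiom, using a toy DataFrame rather than SoMEF's own data:

    import numpy as np
    import pandas as pd

    # Toy frame standing in for the header/content table built in extract_header_content.
    df = pd.DataFrame({"Header": ["Intro", "Usage"], "Content": ["", "Run the tool"]})

    # Reassign the column instead of calling replace(..., inplace=True) on a column view.
    df["Content"] = df["Content"].replace("", np.nan)
    df = df.dropna(subset=["Content"]).reset_index(drop=True)

    # Write through .loc with an explicit row label instead of df["col"].iloc[0] = value.
    df.loc[0, "Header"] = "unknown"
    print(df)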
3 changes: 2 additions & 1 deletion src/somef/parser/bower_parser.py
@@ -24,7 +24,8 @@ def parse_bower_json_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
"value": "bower.json",
# "value": "bower.json",
"value": source,
"type": constants.URL,
},
1,
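The same one-line change recurs in every package-file parser touched below: the hard-coded file name previously stored as the value is replaced by the source argument, i.e. the provenance URL (or local path) of the file that was parsed. A self-contained sketch of the pattern; the Result stand-in and constant values here are simplified assumptions, and only the shape of the add_result call mirrors the diff:

    # Minimal stand-ins so the sketch runs on its own; the real somef classes differ.
    class Result:
        def __init__(self):
            self.results = {}

        def add_result(self, category, result, confidence, source=None):
            self.results.setdefault(category, []).append(
                {"result": result, "confidence": confidence, "source": source}
            )

    CAT_HAS_PACKAGE_FILE = "has_package_file"
    URL = "Url"

    def record_package_file(metadata_result, source):
        # New behaviour: the provenance URL/path itself becomes the value,
        # instead of a hard-coded name such as "bower.json".
        metadata_result.add_result(
            CAT_HAS_PACKAGE_FILE,
            {"value": source, "type": URL},
            1,
            source,
        )

    r = Result()
    record_package_file(r, "https://example.org/bower.json")
    print(r.results[CAT_HAS_PACKAGE_FILE][0]["result"]["value"])
    # -> https://example.org/bower.json

Note the reviewer's point further down: when SoMEF runs on a local checkout with no remote repository, source is a file path rather than a URL.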
3 changes: 2 additions & 1 deletion src/somef/parser/cabal_parser.py
@@ -24,7 +24,8 @@ def parse_cabal_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
"value": Path(file_path).name,
# "value": Path(file_path).name,
"value": source,
"type": constants.URL,
},
1,
3 changes: 2 additions & 1 deletion src/somef/parser/composer_parser.py
@@ -25,7 +25,8 @@ def parse_composer_json(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
"value": "composer.json",
# "value": "composer.json",
"value": source,
"type": constants.URL,
},
1,
3 changes: 2 additions & 1 deletion src/somef/parser/description_parser.py
@@ -24,7 +24,8 @@ def parse_description_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
"value": "DESCRIPTION",
# "value": "DESCRIPTION",
"value": source,
"type": constants.URL,
"source": source
},
3 changes: 2 additions & 1 deletion src/somef/parser/gemspec_parser.py
@@ -27,7 +27,8 @@ def parse_gemspec_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
"value": Path(file_path).name,
# "value": Path(file_path).name,
"value": source,
"type": constants.URL,
},
1,
3 changes: 2 additions & 1 deletion src/somef/parser/package_json_parser.py
@@ -188,7 +188,8 @@ def parse_package_json_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
"value": "package.json",
# "value": "package.json",
"value": source,
"type": constants.URL,
},
1,
3 changes: 2 additions & 1 deletion src/somef/parser/pom_xml_parser.py
@@ -233,7 +233,8 @@ def parse_pom_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
"value": "pom.xml",
# "value": "pom.xml",
"value": source,
"type": constants.URL
},
1,
3 changes: 2 additions & 1 deletion src/somef/parser/toml_parser.py
@@ -45,7 +45,8 @@ def parse_toml_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
"value": display_name,
# "value": display_name,
"value": source,
"type": constants.URL,
},
1,
51 changes: 29 additions & 22 deletions src/somef/regular_expressions.py
@@ -689,6 +689,7 @@ def extract_readthedocs_badgeds(readme_text, repository_metadata: Result, source

# RST
for match in re.findall(constants.REGEXP_READTHEDOCS_RST, readme_text):
print(match)
if isinstance(match, tuple):
urls.update([u for u in match if u])
elif match:
@@ -702,15 +703,21 @@ def extract_readthedocs_badgeds(readme_text, repository_metadata: Result, source
urls.add(match)

# HTML
for match in re.findall(constants.REGEXP_READTHEDOCS_HTML, readme_text):
pattern_html = re.compile(constants.REGEXP_READTHEDOCS_HTML, flags=re.DOTALL | re.IGNORECASE)
for match in pattern_html.findall(readme_text):
if isinstance(match, tuple):
urls.update([u for u in match if u])
elif match:
urls.add(match)

for url in urls:
if "pypi.org/project" in url:
category = constants.CAT_PACKAGE_DISTRIBUTION
else:
category = constants.CAT_DOCUMENTATION

repository_metadata.add_result(
constants.CAT_DOCUMENTATION,
category,
{
constants.PROP_TYPE: constants.URL,
constants.PROP_VALUE: url
@@ -722,28 +729,28 @@

return repository_metadata

def extract_package_manager_badgeds(readme_text, repository_metadata: Result, source) -> Result:
"""
Function that takes the text of a readme file and searches if there are package manager badges.
Parameters
----------
@param readme_text: Text of the readme
@param repository_metadata: Result with all the findings in the repo
@param source: source file on top of which the extraction is performed (provenance)
Returns
-------
@returns Result with the package badges found
"""
package_manager_badges = re.findall(constants.REGEXP_READTHEDOCS_BADGES, readme_text, re.DOTALL)
for package in package_manager_badges:
repository_metadata.add_result(constants.CAT_DOCUMENTATION,
{
constants.PROP_TYPE: constants.URL,
constants.PROP_VALUE: package
}, 1, constants.TECHNIQUE_REGULAR_EXPRESSION, source)
# def extract_package_manager_badgeds(readme_text, repository_metadata: Result, source) -> Result:
# """
# Function that takes the text of a readme file and searches if there are package manager badges.
# Parameters
# ----------
# @param readme_text: Text of the readme
# @param repository_metadata: Result with all the findings in the repo
# @param source: source file on top of which the extraction is performed (provenance)
# Returns
# -------
# @returns Result with the package badges found
# """
# package_manager_badges = re.findall(constants.REGEXP_READTHEDOCS_BADGES, readme_text, re.DOTALL)
# for package in package_manager_badges:
# repository_metadata.add_result(constants.CAT_DOCUMENTATION,
# {
# constants.PROP_TYPE: constants.URL,
# constants.PROP_VALUE: package
# }, 1, constants.TECHNIQUE_REGULAR_EXPRESSION, source)


return repository_metadata
# return repository_metadata


def extract_swh_badges(readme_text, repository_metadata: Result, source) -> Result:
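With this change the URLs collected by extract_readthedocs_badgeds are no longer all filed under documentation: links pointing at pypi.org/project are routed to the package-distribution category instead. A small self-contained sketch of that routing step; the category strings are illustrative stand-ins for the somef constants, and the sample URLs are taken from the test added further down:

    def categorize_urls(urls):
        # Route extracted URLs the way the new loop in extract_readthedocs_badgeds does.
        categorized = []
        for url in urls:
            if "pypi.org/project" in url:
                category = "package_distribution"  # constants.CAT_PACKAGE_DISTRIBUTION
            else:
                category = "documentation"         # constants.CAT_DOCUMENTATION
            categorized.append((category, url))
        return categorized

    urls = [
        "https://pypi.org/project/mentpy",
        "https://docs.mentpy.com/en/latest/?badge=latest",
        "https://icon.readthedocs.io/en/master/",
    ]
    for category, url in categorize_urls(urls):
        print(category, url)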
6 changes: 4 additions & 2 deletions src/somef/test/test_bower_parser.py
@@ -19,7 +19,8 @@ def test_parse_bower_json(self):
metadata_result = parse_bower_json_file(bower_file_path, result, "https://example.org/bower.json")
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
self.assertEqual(package_results[0]["result"]["value"], "bower.json")
# self.assertEqual(package_results[0]["result"]["value"], "bower.json")
self.assertEqual(package_results[0]["result"]["value"], "https://example.org/bower.json")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)

name_results = metadata_result.results.get(constants.CAT_NAME, [])
@@ -73,7 +74,8 @@ def test_parse_2_bower_json(self):
metadata_result = parse_bower_json_file(bower_file_path, result, "https://example.org/bower.json")
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
self.assertEqual(package_results[0]["result"]["value"], "bower.json")
# self.assertEqual(package_results[0]["result"]["value"], "bower.json")
self.assertEqual(package_results[0]["result"]["value"], "https://example.org/bower.json")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)

name_results = metadata_result.results.get(constants.CAT_NAME, [])
6 changes: 4 additions & 2 deletions src/somef/test/test_cabal_parser.py
@@ -19,7 +19,8 @@ def test_parse_cabal(self):

package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
self.assertEqual(package_results[0]["result"]["value"], "unused.cabal")
# self.assertEqual(package_results[0]["result"]["value"], "unused.cabal")
self.assertEqual(package_results[0]["result"]["value"], "https://example.org/unused.cabal")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)

id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
@@ -60,7 +61,8 @@ def test_parse_2_cabal(self):

package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
self.assertEqual(package_results[0]["result"]["value"], "cabal.cabal")
# self.assertEqual(package_results[0]["result"]["value"], "cabal.cabal")
self.assertEqual(package_results[0]["result"]["value"], "https://example.org/cabal.cabal")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
description_results = metadata_result.results.get(constants.CAT_DESCRIPTION, [])
self.assertTrue(len(description_results) > 0, "No description found")
5 changes: 3 additions & 2 deletions src/somef/test/test_description_parser.py
@@ -28,7 +28,8 @@ def test_description(self):

package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
self.assertEqual(package_results[0]["result"]["value"], "DESCRIPTION")
# self.assertEqual(package_results[0]["result"]["value"], "DESCRIPTION")
self.assertEqual(package_results[0]["result"]["value"], "https://example.org/DESCRIPTION")
Collaborator: why is this looking into example.org?
If somef is run locally and there is no repo, it should return the path to the file...

self.assertEqual(package_results[0]["result"]["type"], constants.URL)

id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
@@ -59,7 +60,7 @@ def test_description_2(self):

package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
self.assertEqual(package_results[0]["result"]["value"], "DESCRIPTION")
self.assertEqual(package_results[0]["result"]["value"], "https://example.org/DESCRIPTION")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)

id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
3 changes: 2 additions & 1 deletion src/somef/test/test_gemspec_parser.py
@@ -22,7 +22,8 @@ def test_parse_gemspec(self):

package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
self.assertEqual(package_results[0]["result"]["value"], "bootstrap-datepicker-rails.gemspec")
# self.assertEqual(package_results[0]["result"]["value"], "bootstrap-datepicker-rails.gemspec")
self.assertEqual(package_results[0]["result"]["value"], "https://example.org/bootstrap-datepicker-rails.gemspec")
Collaborator: same as above

self.assertEqual(package_results[0]["result"]["type"], constants.URL)

id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
3 changes: 2 additions & 1 deletion src/somef/test/test_package_json_parser.py
@@ -58,7 +58,8 @@ def test_parse_package_json_file(self):

package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
self.assertEqual(package_results[0]["result"]["value"], "package.json")
# self.assertEqual(package_results[0]["result"]["value"], "package.json")
self.assertEqual(package_results[0]["result"]["value"], "http://example.com/package_neors.json")
Collaborator: same as above

self.assertEqual(package_results[0]["result"]["type"], constants.URL)

keywords_results = metadata_result.results.get(constants.CAT_KEYWORDS, [])
3 changes: 2 additions & 1 deletion src/somef/test/test_pom_parser.py
@@ -34,7 +34,8 @@ def test_parse_pom_file(self):

package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
self.assertEqual(package_results[0]["result"]["value"], "pom.xml")
# self.assertEqual(package_results[0]["result"]["value"], "pom.xml")
self.assertEqual(package_results[0]["result"]["value"], "https://example.org/pom.xml")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)

requirements_results = metadata_result.results.get(constants.CAT_REQUIREMENTS, [])
18 changes: 16 additions & 2 deletions src/somef/test/test_regular_expressions.py
@@ -446,6 +446,7 @@ def test_issue_771(self):
def test_issue_860(self):
"""Test designed to check redthedocs links are extracted correctly when multiple links are present """
documentation_values = []
package_values = []

for readme_file in ["README-menty.md", "README-uncbiag.md"]:
with open(test_data_path + readme_file, "r") as data_file:
@@ -461,14 +462,27 @@
if isinstance(value, str):
documentation_values.append(value)

if "package_distribution" in documentation.results:
for result in documentation.results["package_distribution"]:
value = result.get("result", {}).get("value")
if isinstance(value, str):
package_values.append(value)

expected_doc_urls = {
"https://pypi.org/project/mentpy",
"https://docs.mentpy.com/en/latest/?badge=latest",
"https://icon.readthedocs.io/en/master/"
}


expected_package_urls = {
"https://pypi.org/project/mentpy"
}

assert expected_doc_urls.issubset(set(documentation_values)), (
f"Expected documentation URLs {expected_doc_urls} not found in {documentation_values}"
)
assert expected_package_urls.issubset(set(package_values)), (
f"Pypy package {expected_package_urls} not foun in package_distribution: {package_values}"
)

def test_readme_rst_readthedocs(self):
"""Test designed to check whether rst readmes get stuck in extracting documentation """
14 changes: 10 additions & 4 deletions src/somef/test/test_toml_parser.py
@@ -56,11 +56,15 @@ def test_parse_cargo_toml(self):
)

result = Result()
parse_toml_file(self.cargo_toml_path, result, "test")
# parse_toml_file(self.cargo_toml_path, result, "test")
print("self.cargo_toml_path:", self.cargo_toml_path)
parse_toml_file(self.cargo_toml_path, result, "http://example.com/rustdesk/Cargo.toml")
Collaborator: same as above


self.assertIn(constants.CAT_HAS_PACKAGE_FILE, result.results)
package_file = result.results[constants.CAT_HAS_PACKAGE_FILE][0]["result"]["value"]
self.assertEqual(package_file, "Cargo.toml")

# self.assertEqual(package_file, "Cargo.toml")
self.assertEqual(package_file, "http://example.com/rustdesk/Cargo.toml")

self.assertIn(constants.CAT_PACKAGE_ID, result.results)
package_id = result.results[constants.CAT_PACKAGE_ID][0]["result"]["value"]
@@ -102,7 +106,8 @@ def test_parse_pluto_project_toml(self):

package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
self.assertEqual(package_results[0]["result"]["value"], "Project.toml")
# self.assertEqual(package_results[0]["result"]["value"], "Project.toml")
self.assertEqual(package_results[0]["result"]["value"], "http://example.com/repo1/Project.toml")
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
self.assertEqual(package_results[0]["technique"], constants.TECHNIQUE_CODE_CONFIG_PARSER)

@@ -168,7 +173,8 @@ def test_parse_flux_project_toml(self):

package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
self.assertEqual(package_results[0]["result"]["value"], "Project.toml")
# self.assertEqual(package_results[0]["result"]["value"], "Project.toml")
self.assertEqual(package_results[0]["result"]["value"], "http://example.com/repo2/Project.toml")

package_id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
self.assertTrue(len(package_id_results) > 0, "No package ID found")
12 changes: 6 additions & 6 deletions src/somef/utils/constants.py
@@ -54,12 +54,12 @@
REGEXP_READTHEDOCS_MD = (
r"\(\s*(https?://[^\s\)]+\.readthedocs\.io[^\s\)]*)\s*\)"
)
# REGEXP_READTHEDOCS_HTML = (
# r"<a[^>]+href=['\"](https?://[^\s\"']+\.readthedocs\.io[^\s\"']*)['\"][^>]*>"
# r"(?:\s|<[^>]+>)*"
# r"<img[^>]+src=['\"]https?://readthedocs\.org/projects/[^\s/]+/badge/[^\s\"']*"
# )
REGEXP_READTHEDOCS_HTML = ( r"<a[^>]+href=['\"](https?://[^\s\"']+)['\"][^>]*>" r"(?:\s|<[^>]+>)*" r"<img[^>]+src=['\"]https?://readthedocs\.org/projects/[^\s/]+/badge/[^\s\"']*" )

REGEXP_READTHEDOCS_HTML = (
r"<a\b[^>]+?href=['\"](https?://[^'\"\s]+?)['\"][^>]*?>"
r"(?:(?!</a>)[\s\S])*?"
r"<img\b[^>]+?src=['\"]https?://(?:readthedocs\.org/projects/|img\.shields\.io/pypi/)[^'\"\s]*"
Collaborator: This regular expression is potentially dangerous, and it may be slow.
Try this one instead:

r"""
    <a\b
        [^>]*\bhref=['"](https?://[^'"\s]+)['"]
        [^>]*>
    (?:
        [^<]+
        |
        <(?!/a\b)[^>]*>
    )*
    <img\b
        [^>]*\bsrc=['"]
        https?://(?:readthedocs\.org/projects/|img\.shields\.io/pypi/)
        [^'"\s>]+
        ['"]
    """

Contributor Author: ok, I'll try again after all tests pass.

)
# For natural language citation
REGEXP_DOI_NATURAL = r'10\.\d{4,9}/[-._;()/:A-Za-z0-9]+'
REGEXP_YEAR_NATURAL = r'\b(19|20)\d{2}\b'
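The replacement pattern suggested in the review above is written in free-spacing style, so it only works when compiled with re.VERBOSE (case-insensitivity, if wanted, still has to be requested separately). A quick sanity check of that suggestion against a typical ReadTheDocs badge snippet; the HTML sample is invented for illustration:

    import re

    # Reviewer's free-spacing pattern; re.VERBOSE is required for it to behave as intended.
    READTHEDOCS_HTML_BADGE = r"""
        <a\b
            [^>]*\bhref=['"](https?://[^'"\s]+)['"]
            [^>]*>
        (?:
            [^<]+
            |
            <(?!/a\b)[^>]*>
        )*
        <img\b
            [^>]*\bsrc=['"]
            https?://(?:readthedocs\.org/projects/|img\.shields\.io/pypi/)
            [^'"\s>]+
            ['"]
    """

    # Invented badge snippet, shaped like the HTML the extractor targets.
    sample = (
        '<a href="https://somef.readthedocs.io/en/latest/">'
        '<img src="https://readthedocs.org/projects/somef/badge/?version=latest" '
        'alt="Documentation Status"></a>'
    )

    print(re.findall(READTHEDOCS_HTML_BADGE, sample, re.VERBOSE | re.IGNORECASE))
    # expected: ['https://somef.readthedocs.io/en/latest/']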