From 976c5adb1b39ccd9e783283a5d9a252a9dcbae5d Mon Sep 17 00:00:00 2001 From: ziad hany Date: Sat, 30 Aug 2025 05:53:53 +0300 Subject: [PATCH 1/4] Add initial improver for collect repo fix commits Signed-off-by: ziad hany --- vulnerabilities/improvers/__init__.py | 4 + .../v2_improvers/collect_repo_fix_commits.py | 185 ++++++++++++++++ .../pipelines/v2_improvers/extract_commits.py | 202 ++++++++++++++++++ .../pipelines/v2_improvers/issue_tracker_.py | 76 +++++++ 4 files changed, 467 insertions(+) create mode 100644 vulnerabilities/pipelines/v2_improvers/collect_repo_fix_commits.py create mode 100644 vulnerabilities/pipelines/v2_improvers/extract_commits.py create mode 100644 vulnerabilities/pipelines/v2_improvers/issue_tracker_.py diff --git a/vulnerabilities/improvers/__init__.py b/vulnerabilities/improvers/__init__.py index 1be791241..f95cc6639 100644 --- a/vulnerabilities/improvers/__init__.py +++ b/vulnerabilities/improvers/__init__.py @@ -19,6 +19,9 @@ from vulnerabilities.pipelines import flag_ghost_packages from vulnerabilities.pipelines import populate_vulnerability_summary_pipeline from vulnerabilities.pipelines import remove_duplicate_advisories +from vulnerabilities.pipelines.v2_improvers import ( + collect_repo_fix_commits as collect_repo_fix_commits_v2, +) from vulnerabilities.pipelines.v2_improvers import compute_advisory_todo as compute_advisory_todo_v2 from vulnerabilities.pipelines.v2_improvers import compute_package_risk as compute_package_risk_v2 from vulnerabilities.pipelines.v2_improvers import ( @@ -67,6 +70,7 @@ compute_package_risk_v2.ComputePackageRiskPipeline, compute_version_rank_v2.ComputeVersionRankPipeline, compute_advisory_todo_v2.ComputeToDo, + collect_repo_fix_commits_v2.CollectRepoFixCommitPipeline, compute_advisory_todo.ComputeToDo, ] ) diff --git a/vulnerabilities/pipelines/v2_improvers/collect_repo_fix_commits.py b/vulnerabilities/pipelines/v2_improvers/collect_repo_fix_commits.py new file mode 100644 index 000000000..a8eac0abb --- /dev/null +++ b/vulnerabilities/pipelines/v2_improvers/collect_repo_fix_commits.py @@ -0,0 +1,185 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# +import bisect +import re +from collections import defaultdict +from typing import List +from typing import Optional +from typing import Tuple + +from git import Commit +from git import Repo + +from vulnerabilities.models import AdvisoryV2 +from vulnerabilities.models import CodeFixV2 +from vulnerabilities.pipelines import VulnerableCodePipeline + + +class CollectRepoFixCommitPipeline(VulnerableCodePipeline): + """ + Pipeline to collect fix commits from any git repository. + """ + + pipeline_id = "repo_fix_commit_pipeline" + repositories_url = "git+https://github.com/the-tcpdump-group/tcpdump" + + @classmethod + def steps(cls): + return ( + cls.collect_fix_commits, + cls.store_fix_commits, + ) + + def classify_commit_type(self, commit) -> str: + num_parents = len(commit.parents) + if num_parents == 0: + return "root" + elif num_parents == 1: + return "normal" + else: + return "merge" + + def detect_fix_commit(self, commit) -> str: + """ + Detect whether a commit is a bug-fix or vulnerability-fix commit. + Returns: "vulnerability_fix" or "other" + """ + msg = commit.message.lower() + security_patterns = [ + # CVE identifiers + r"\bcve-[0-9]{4}-[0-9]{4,19}\b", + ] + if any(re.search(p, msg) for p in security_patterns): + return "vulnerability_fix" + return "other" + + def extract_cves(self, text: str) -> List[str]: + if not text: + return [] + cves = re.findall(r"cve-[0-9]{4}-[0-9]{4,19}", text, flags=re.IGNORECASE) + return list({cve.upper() for cve in cves}) + + def get_previous_releases( + self, + release_tags_sorted: List[Tuple[str, int]], + dates: List[int], + commit_date: int, + ) -> List[str]: + index = bisect.bisect_left(dates, commit_date) + return [tag for tag, _ in release_tags_sorted[:index]] + + def get_current_or_next_release( + self, + release_tags_sorted: List[Tuple[str, int]], + dates: List[int], + commit_date: int, + ) -> Optional[str]: + index = bisect.bisect_left(dates, commit_date) + + if index < len(dates) and dates[index] == commit_date: + return release_tags_sorted[index][0] + + if index < len(dates): + return release_tags_sorted[index][0] + + return None + + def get_current_release( + self, repo: Repo, commit: Commit, prev_release_by_date: Optional[str] + ) -> str: + try: + return repo.git.describe("--tags", "--exact-match", commit.hexsha) + except Exception: + pass + + try: + return repo.git.describe("--tags", "--abbrev=0", "--first-parent", commit.hexsha) + except Exception: + pass + + if prev_release_by_date: + return prev_release_by_date + + return "NO_TAGS_AVAILABLE" + + def collect_fix_commits(self): + self.log("Processing git repository fix commits.") + repo_url = "https://github.com/the-tcpdump-group/tcpdump" + repo_path = "/home/ziad-hany/PycharmProjects/tcpdump" + + repo = Repo(repo_path) + cve_list = defaultdict(set) + + # Precompute release tags + release_tags = [] + for tag in repo.tags: + try: + release_tags.append((tag.name, tag.commit.committed_date)) + except Exception: + continue + + release_tags_sorted = sorted(release_tags, key=lambda x: x[1]) + dates_array = [date for _, date in release_tags_sorted] + + for commit in repo.iter_commits("--all"): + commit_type = self.classify_commit_type(commit) + fix_type = self.detect_fix_commit(commit) + + if fix_type == "vulnerability_fix" and commit_type in ["normal", "merge"]: + prev_release_list = self.get_previous_releases( + release_tags_sorted, dates_array, commit.committed_date + ) + prev_release_by_date = prev_release_list[-1] if prev_release_list else None + + curr_release = self.get_current_release(repo, commit, prev_release_by_date) + commit_info = { + "hash": commit.hexsha, + "url": repo_url + "/commit/" + commit.hexsha, + "message": commit.message.strip(), + "curr_release": curr_release, + "prev_release": prev_release_list, + "fix_type": fix_type, + } + + for cve_id in self.extract_cves(commit.message.strip()): + commit_url = f"{repo_url}/commit/{commit.hexsha}" + cve_list[cve_id].add(commit_url) + + # Save results into pipeline state + self.fix_commits = {cve: list(commits) for cve, commits in cve_list.items()} + self.log(f"Found {len(self.fix_commits)} unique CVEs with fix commits.") + + def store_fix_commits(self): + if not hasattr(self, "fix_commits"): + self.log("No fix commits collected. Run collect_fix_commits() first.") + return + + created_fix_count = 0 + + # FIXME + for vulnerability_id, commit_urls in self.fix_commits.items(): + advisories = AdvisoryV2.objects.filter(advisory_id__iendswith=vulnerability_id) + + if not advisories.exists(): + self.log(f"No advisories found for vulnerability_id: {vulnerability_id}") + continue + + for adv in advisories: + for impact in adv.impacted_packages.all(): + for package in impact.affecting_packages.all(): + for vcs_url in commit_urls: + code_fix, created = CodeFixV2.objects.get_or_create( + commits=[vcs_url], + advisory=adv, + affected_package=package, + ) + if created: + created_fix_count += 1 + + self.log(f"Stored {created_fix_count} new CodeFixV2 entries.") diff --git a/vulnerabilities/pipelines/v2_improvers/extract_commits.py b/vulnerabilities/pipelines/v2_improvers/extract_commits.py new file mode 100644 index 000000000..1d844c8eb --- /dev/null +++ b/vulnerabilities/pipelines/v2_improvers/extract_commits.py @@ -0,0 +1,202 @@ +import bisect +import json +import os +import re +from collections import defaultdict +from typing import List +from typing import Optional +from typing import Tuple + +from git import Commit +from git import Repo + + +def clone_repo(repo_url: str, clone_dir: str) -> str: + os.makedirs(clone_dir, exist_ok=True) + try: + print(f"Cloning {repo_url} into {clone_dir}...") + repo = Repo.clone_from(repo_url, clone_dir) + print("Clone successful.") + return repo.working_tree_dir + except Exception as e: + print(f"Failed to clone repository: {e}") + return "" + + +def classify_commit_type(commit) -> str: + num_parents = len(commit.parents) + if num_parents == 0: + return "root" # never a fix + elif num_parents == 1: + return "normal" # main source of fixes + else: + return "merge" # usually not a fix + + +def detect_fix_commit(commit) -> str: + """ + Detect whether a commit is a bug-fix or vulnerability-fix commit. + Returns: "vulnerability_fix", "other" + """ + msg = commit.message.lower() + + security_patterns = [ + # CVE identifiers + r"\bcve-\d{4}-\d{4,}\b", + # Explicitly marked security fixes + r"\bsecurity fix\b", + r"\bfix security issue\b", + r"\bfix(?:es)? for security\b", + # Permission / privilege escalation + r"\bprivilege escalation\b", + r"\bprivesc\b", + r"\bescalat(?:e|ion) of privilege\b", + # No New Privileges / unsafe exec + r"\bno[- ]new[- ]privs\b", + r"\bunsafe exec\b", + # Refcount / UAF (classic kernel vulns, almost always security) + r"\buse[- ]after[- ]free\b", + r"\buaf\b", + r"\brefcount (?:leak|error|overflow|underflow)\b", + r"\bdouble free\b", + # Out-of-bounds (OOB) + r"\bout[- ]of[- ]bounds\b", + r"\boob\b", + # Info leaks (security-relevant, not generic leaks) + r"\binformation leak\b", + r"\binfo leak\b", + r"\bleak (?:kernel|userns|credentials?|mnt_idmap)\b", + # Bypass + r"\bsecurity bypass\b", + r"\baccess control bypass\b", + r"\bpermission check (?:bug|fix|error)\b", + ] + + SECURITY_REGEX = re.compile("|".join(security_patterns), re.IGNORECASE) + + if SECURITY_REGEX.search(msg): + return "vulnerability_fix" + return "other" + + +def extract_cves(text: str) -> List[str]: + if not text: + return [] + cves = re.findall(r"cve-[0-9]{4}-[0-9]{4,19}", text, flags=re.IGNORECASE) + return list({cve.upper() for cve in cves}) + + +def get_previous_releases( + release_tags_sorted: List[Tuple[str, int]], dates: List[int], commit_date: int +) -> List[str]: + """ + Get all release tags with commit dates strictly before the given commit date. + release_tags_sorted: list of (tag_name, committed_date), sorted by committed_date + dates: list of commit dates (parallel to release_tags_sorted, sorted ascending) + """ + index = bisect.bisect_left(dates, commit_date) + return [tag for tag, _ in release_tags_sorted[:index]] + + +def get_current_or_next_release( + release_tags_sorted: List[Tuple[str, int]], dates: List[int], commit_date: int +) -> Optional[str]: + """ + Get the current release if commit matches a release date, + otherwise return the next release after the commit date. + """ + index = bisect.bisect_left(dates, commit_date) + + # Exact match → this commit is tagged + if index < len(dates) and dates[index] == commit_date: + return release_tags_sorted[index][0] + + # Otherwise, next release after this commit + if index < len(dates): + return release_tags_sorted[index][0] + + # No next release available + return None + + +def get_current_release(repo: Repo, commit: Commit, prev_release_by_date: Optional[str]) -> str: + """ + Return a non-null release tag for the given commit: + 1) exact tag if commit is tagged + 2) nearest reachable tag (fast, first-parent) + 3) latest prior tag by date (fallback) + 4) "NO_TAGS_AVAILABLE" if repo has no tags at all + """ + # 1) Exact tag at this commit + try: + return repo.git.describe("--tags", "--exact-match", commit.hexsha) + except Exception: + pass + + # 2) Nearest reachable tag along first-parent + try: + return repo.git.describe("--tags", "--abbrev=0", "--first-parent", commit.hexsha) + except Exception: + pass + + # 3) Fallback: latest prior tag by date + if prev_release_by_date: + return prev_release_by_date + + # 4) No tags at all + return "NO_TAGS_AVAILABLE" + + +if __name__ == "__main__": + repo_url = "https://github.com/torvalds/linux" + repo_path = "/home/ziad-hany/PycharmProjects/linux" + + repo = Repo(repo_path) + commits_data = [] + cve_list = defaultdict(set) + + # Precompute and sort release tags by commit date + release_tags = [] + for tag in repo.tags: + try: + release_tags.append((tag.name, tag.commit, tag.commit.committed_date)) + except Exception: + continue + + release_tags_sorted = sorted(release_tags, key=lambda x: x[2]) + + # For previous releases lookup (by date) + release_tags_for_previous = [(tag_name, date) for tag_name, _, date in release_tags_sorted] + dates_array = [date for _, date in release_tags_for_previous] + + for commit in repo.iter_commits("--all"): + commit_type = classify_commit_type(commit) + fix_type = detect_fix_commit(commit) + + if fix_type == "vulnerability_fix" and commit_type in ["normal", "merge"]: + # Compute "previous by date" first so we can feed it as a fallback + prev_release_list = get_previous_releases( + release_tags_for_previous, dates_array, commit.committed_date + ) + prev_release_by_date = prev_release_list[-1] if prev_release_list else None + + curr_release = get_current_release(repo, commit, prev_release_by_date) + + commit_info = { + "hash": commit.hexsha, + "url": repo_url + "/commit/" + commit.hexsha, + "message": commit.message.strip(), + "curr_release": curr_release, + "prev_release": prev_release_list, + "fix_type": fix_type, + } + print(commit_info) + commits_data.append(commit_info) + + # Optional CVE collection + for cve_id in extract_cves(commit.message.strip()): + cve_list[cve_id].add(repo_url + "/commit/" + commit.hexsha) + + result = {cve: list(commits) for cve, commits in cve_list.items()} + print(f"Found {len(result)} unique CVEs") + print(json.dumps(result, indent=2)) diff --git a/vulnerabilities/pipelines/v2_improvers/issue_tracker_.py b/vulnerabilities/pipelines/v2_improvers/issue_tracker_.py new file mode 100644 index 000000000..05edd4008 --- /dev/null +++ b/vulnerabilities/pipelines/v2_improvers/issue_tracker_.py @@ -0,0 +1,76 @@ +from abc import ABC +from abc import abstractmethod +from typing import Dict +from typing import List +from typing import Optional + +import requests + + +class IssueTrackerClient(ABC): + @abstractmethod + def get_issues(self, project: str, **kwargs) -> List[Dict]: + pass + + @abstractmethod + def get_pull_requests(self, project: str, **kwargs) -> List[Dict]: + pass + + @abstractmethod + def get_comments(self, project: str, **kwargs) -> List[Dict]: + pass + + +class IssueTrackerFactory: + @staticmethod + def create_client(platform: str, token: Optional[str] = None, **kwargs) -> IssueTrackerClient: + platform = platform.lower() + + GIT_PLATFORM_CLIENT = { + "github": GitHubClient, + } + + if platform not in GIT_PLATFORM_CLIENT: + raise ValueError(f"Unsupported platform: {platform}") + + return GIT_PLATFORM_CLIENT[platform](token=token, **kwargs) + + +class GitHubClient(IssueTrackerClient): + API_BASE = "https://api.github.com" + + def __init__(self, token: Optional[str] = None): + self.session = requests.Session() + self.session.headers.update({"Accept": "application/vnd.github.v3+json"}) + if token: + self.session.headers["Authorization"] = f"token {token}" + + def _paginate(self, url: str, params: dict = None) -> List[Dict]: + results, page = [], 1 + params = params or {} + while True: + params.update({"per_page": 100, "page": page}) + response = self.session.get(url, params=params) + response.raise_for_status() + data = response.json() + if not data: + break + results.extend(data) + page += 1 + return results + + def get_issues(self, project: str, state: str = "all") -> List[Dict]: + owner, repo = project.split("/") + url = f"{self.API_BASE}/repos/{owner}/{repo}/issues" + issues = self._paginate(url, {"state": state}) + return [i for i in issues if "pull_request" not in i] + + def get_pull_requests(self, project: str, state: str = "all") -> List[Dict]: + owner, repo = project.split("/") + url = f"{self.API_BASE}/repos/{owner}/{repo}/pulls" + return self._paginate(url, {"state": state}) + + def get_comments(self, project: str, issue_num: int) -> List[Dict]: + owner, repo = project.split("/") + url = f"{self.API_BASE}/repos/{owner}/{repo}/issues/{issue_num}/comments" + return self._paginate(url) From 082134eef160f092e5530dbd0fb5fab9d96958f3 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Mon, 13 Oct 2025 15:35:54 +0300 Subject: [PATCH 2/4] Add CollectRepoFixCommitPipeline Add a test for CollectRepoFixCommitPipeline Signed-off-by: ziad hany --- vulnerabilities/importers/__init__.py | 4 + vulnerabilities/improvers/__init__.py | 4 - .../v2_importers/collect_repo_fix_commits.py | 117 +++++++++++ .../v2_improvers/collect_repo_fix_commits.py | 185 ------------------ .../v2_importers/test_collect_fix_commit.py | 124 ++++++++++++ .../expected_linux_advisory_output.json | 40 ++++ .../fix_commits/grouped_commits_input.json | 8 + 7 files changed, 293 insertions(+), 189 deletions(-) create mode 100644 vulnerabilities/pipelines/v2_importers/collect_repo_fix_commits.py delete mode 100644 vulnerabilities/pipelines/v2_improvers/collect_repo_fix_commits.py create mode 100644 vulnerabilities/tests/pipelines/v2_importers/test_collect_fix_commit.py create mode 100644 vulnerabilities/tests/test_data/fix_commits/expected_linux_advisory_output.json create mode 100644 vulnerabilities/tests/test_data/fix_commits/grouped_commits_input.json diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index 82ee4525a..63e6f80c7 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -43,6 +43,9 @@ from vulnerabilities.pipelines import pysec_importer from vulnerabilities.pipelines.v2_importers import apache_httpd_importer as apache_httpd_v2 from vulnerabilities.pipelines.v2_importers import archlinux_importer as archlinux_importer_v2 +from vulnerabilities.pipelines.v2_importers import ( + collect_repo_fix_commits as collect_repo_fix_commits, +) from vulnerabilities.pipelines.v2_importers import curl_importer as curl_importer_v2 from vulnerabilities.pipelines.v2_importers import ( elixir_security_importer as elixir_security_importer_v2, @@ -115,5 +118,6 @@ ubuntu_usn.UbuntuUSNImporter, fireeye.FireyeImporter, oss_fuzz.OSSFuzzImporter, + collect_repo_fix_commits.CollectRepoFixCommitPipeline, ] ) diff --git a/vulnerabilities/improvers/__init__.py b/vulnerabilities/improvers/__init__.py index f95cc6639..1be791241 100644 --- a/vulnerabilities/improvers/__init__.py +++ b/vulnerabilities/improvers/__init__.py @@ -19,9 +19,6 @@ from vulnerabilities.pipelines import flag_ghost_packages from vulnerabilities.pipelines import populate_vulnerability_summary_pipeline from vulnerabilities.pipelines import remove_duplicate_advisories -from vulnerabilities.pipelines.v2_improvers import ( - collect_repo_fix_commits as collect_repo_fix_commits_v2, -) from vulnerabilities.pipelines.v2_improvers import compute_advisory_todo as compute_advisory_todo_v2 from vulnerabilities.pipelines.v2_improvers import compute_package_risk as compute_package_risk_v2 from vulnerabilities.pipelines.v2_improvers import ( @@ -70,7 +67,6 @@ compute_package_risk_v2.ComputePackageRiskPipeline, compute_version_rank_v2.ComputeVersionRankPipeline, compute_advisory_todo_v2.ComputeToDo, - collect_repo_fix_commits_v2.CollectRepoFixCommitPipeline, compute_advisory_todo.ComputeToDo, ] ) diff --git a/vulnerabilities/pipelines/v2_importers/collect_repo_fix_commits.py b/vulnerabilities/pipelines/v2_importers/collect_repo_fix_commits.py new file mode 100644 index 000000000..8c6671a31 --- /dev/null +++ b/vulnerabilities/pipelines/v2_importers/collect_repo_fix_commits.py @@ -0,0 +1,117 @@ +import os +import re +import shutil +import subprocess +import tempfile +from collections import defaultdict + +from git import Repo + +from vulnerabilities.importer import AdvisoryData +from vulnerabilities.importer import ReferenceV2 +from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 + +SECURITY_PATTERNS = [ + r"\bCVE-\d{4}-\d{4,19}\b", + r"\bGHSA-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}\b", + r"\bPYSEC-\d{4}-\d{1,6}\b", + r"\bXSA-\d{1,4}\b", +] + + +class CollectRepoFixCommitPipeline(VulnerableCodeBaseImporterPipelineV2): + """ + Pipeline to collect fix commits from any git repository. + """ + + pipeline_id = "repo_fix_commit" + + @classmethod + def steps(cls): + return ( + cls.clone, + cls.collect_and_store_advisories, + cls.clean_downloads, + ) + + def clone(self): + """Clone the repository.""" + self.repo_url = "https://github.com/torvalds/linux" + repo_path = tempfile.mkdtemp() + cmd = [ + "git", + "clone", + "--bare", + "--filter=blob:none", + "--no-checkout", + self.repo_url, + repo_path, + ] + subprocess.run(cmd, check=True) + self.repo = Repo(repo_path) + + def advisories_count(self) -> int: + return int(self.repo.git.rev_list("--count", "HEAD")) + + def classify_commit_type(self, commit) -> list[str]: + """ + Extract vulnerability identifiers from a commit message. + Returns a list of matched vulnerability IDs (normalized to uppercase). + """ + matches = [] + for pattern in SECURITY_PATTERNS: + found = re.findall(pattern, commit.message, flags=re.IGNORECASE) + matches.extend(found) + return matches + + def collect_fix_commits(self): + """ + Iterate through repository commits and group them by vulnerability identifiers. + return a list with (vuln_id, [(commit_id, commit_message)]). + """ + self.log("Processing git repository fix commits (grouped by vulnerability IDs).") + + grouped_commits = defaultdict(list) + for commit in self.repo.iter_commits("--all"): + matched_ids = self.classify_commit_type(commit) + if not matched_ids: + continue + + commit_id = commit.hexsha + commit_message = commit.message.strip() + + for vuln_id in matched_ids: + grouped_commits[vuln_id].append((commit_id, commit_message)) + + self.log(f"Found {len(grouped_commits)} vulnerabilities with related commits.") + self.log("Finished processing all commits.") + return grouped_commits + + def collect_advisories(self): + """ + Generate AdvisoryData objects for each vulnerability ID grouped with its related commits. + """ + self.log("Generating AdvisoryData objects from grouped commits.") + grouped_commits = self.collect_fix_commits() + for vuln_id, commits in grouped_commits.items(): + references = [ReferenceV2(url=f"{self.repo_url}/commit/{cid}") for cid, _ in commits] + + summary_lines = [f"- {cid}: {msg}" for cid, msg in commits] + summary = f"Commits fixing {vuln_id}:\n" + "\n".join(summary_lines) + yield AdvisoryData( + advisory_id=vuln_id, + aliases=[vuln_id], + summary=summary, + references_v2=references, + url=self.repo_url, + ) + + def clean_downloads(self): + """Cleanup any temporary repository data.""" + self.log("Cleaning up local repository resources.") + if os.path.isdir(self.repo.working_tree_dir): + shutil.rmtree(path=self.repo.working_tree_dir) + + def on_failure(self): + """Ensure cleanup is always performed on failure.""" + self.clean_downloads() diff --git a/vulnerabilities/pipelines/v2_improvers/collect_repo_fix_commits.py b/vulnerabilities/pipelines/v2_improvers/collect_repo_fix_commits.py deleted file mode 100644 index a8eac0abb..000000000 --- a/vulnerabilities/pipelines/v2_improvers/collect_repo_fix_commits.py +++ /dev/null @@ -1,185 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# VulnerableCode is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/vulnerablecode for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# -import bisect -import re -from collections import defaultdict -from typing import List -from typing import Optional -from typing import Tuple - -from git import Commit -from git import Repo - -from vulnerabilities.models import AdvisoryV2 -from vulnerabilities.models import CodeFixV2 -from vulnerabilities.pipelines import VulnerableCodePipeline - - -class CollectRepoFixCommitPipeline(VulnerableCodePipeline): - """ - Pipeline to collect fix commits from any git repository. - """ - - pipeline_id = "repo_fix_commit_pipeline" - repositories_url = "git+https://github.com/the-tcpdump-group/tcpdump" - - @classmethod - def steps(cls): - return ( - cls.collect_fix_commits, - cls.store_fix_commits, - ) - - def classify_commit_type(self, commit) -> str: - num_parents = len(commit.parents) - if num_parents == 0: - return "root" - elif num_parents == 1: - return "normal" - else: - return "merge" - - def detect_fix_commit(self, commit) -> str: - """ - Detect whether a commit is a bug-fix or vulnerability-fix commit. - Returns: "vulnerability_fix" or "other" - """ - msg = commit.message.lower() - security_patterns = [ - # CVE identifiers - r"\bcve-[0-9]{4}-[0-9]{4,19}\b", - ] - if any(re.search(p, msg) for p in security_patterns): - return "vulnerability_fix" - return "other" - - def extract_cves(self, text: str) -> List[str]: - if not text: - return [] - cves = re.findall(r"cve-[0-9]{4}-[0-9]{4,19}", text, flags=re.IGNORECASE) - return list({cve.upper() for cve in cves}) - - def get_previous_releases( - self, - release_tags_sorted: List[Tuple[str, int]], - dates: List[int], - commit_date: int, - ) -> List[str]: - index = bisect.bisect_left(dates, commit_date) - return [tag for tag, _ in release_tags_sorted[:index]] - - def get_current_or_next_release( - self, - release_tags_sorted: List[Tuple[str, int]], - dates: List[int], - commit_date: int, - ) -> Optional[str]: - index = bisect.bisect_left(dates, commit_date) - - if index < len(dates) and dates[index] == commit_date: - return release_tags_sorted[index][0] - - if index < len(dates): - return release_tags_sorted[index][0] - - return None - - def get_current_release( - self, repo: Repo, commit: Commit, prev_release_by_date: Optional[str] - ) -> str: - try: - return repo.git.describe("--tags", "--exact-match", commit.hexsha) - except Exception: - pass - - try: - return repo.git.describe("--tags", "--abbrev=0", "--first-parent", commit.hexsha) - except Exception: - pass - - if prev_release_by_date: - return prev_release_by_date - - return "NO_TAGS_AVAILABLE" - - def collect_fix_commits(self): - self.log("Processing git repository fix commits.") - repo_url = "https://github.com/the-tcpdump-group/tcpdump" - repo_path = "/home/ziad-hany/PycharmProjects/tcpdump" - - repo = Repo(repo_path) - cve_list = defaultdict(set) - - # Precompute release tags - release_tags = [] - for tag in repo.tags: - try: - release_tags.append((tag.name, tag.commit.committed_date)) - except Exception: - continue - - release_tags_sorted = sorted(release_tags, key=lambda x: x[1]) - dates_array = [date for _, date in release_tags_sorted] - - for commit in repo.iter_commits("--all"): - commit_type = self.classify_commit_type(commit) - fix_type = self.detect_fix_commit(commit) - - if fix_type == "vulnerability_fix" and commit_type in ["normal", "merge"]: - prev_release_list = self.get_previous_releases( - release_tags_sorted, dates_array, commit.committed_date - ) - prev_release_by_date = prev_release_list[-1] if prev_release_list else None - - curr_release = self.get_current_release(repo, commit, prev_release_by_date) - commit_info = { - "hash": commit.hexsha, - "url": repo_url + "/commit/" + commit.hexsha, - "message": commit.message.strip(), - "curr_release": curr_release, - "prev_release": prev_release_list, - "fix_type": fix_type, - } - - for cve_id in self.extract_cves(commit.message.strip()): - commit_url = f"{repo_url}/commit/{commit.hexsha}" - cve_list[cve_id].add(commit_url) - - # Save results into pipeline state - self.fix_commits = {cve: list(commits) for cve, commits in cve_list.items()} - self.log(f"Found {len(self.fix_commits)} unique CVEs with fix commits.") - - def store_fix_commits(self): - if not hasattr(self, "fix_commits"): - self.log("No fix commits collected. Run collect_fix_commits() first.") - return - - created_fix_count = 0 - - # FIXME - for vulnerability_id, commit_urls in self.fix_commits.items(): - advisories = AdvisoryV2.objects.filter(advisory_id__iendswith=vulnerability_id) - - if not advisories.exists(): - self.log(f"No advisories found for vulnerability_id: {vulnerability_id}") - continue - - for adv in advisories: - for impact in adv.impacted_packages.all(): - for package in impact.affecting_packages.all(): - for vcs_url in commit_urls: - code_fix, created = CodeFixV2.objects.get_or_create( - commits=[vcs_url], - advisory=adv, - affected_package=package, - ) - if created: - created_fix_count += 1 - - self.log(f"Stored {created_fix_count} new CodeFixV2 entries.") diff --git a/vulnerabilities/tests/pipelines/v2_importers/test_collect_fix_commit.py b/vulnerabilities/tests/pipelines/v2_importers/test_collect_fix_commit.py new file mode 100644 index 000000000..6f6b1b8b1 --- /dev/null +++ b/vulnerabilities/tests/pipelines/v2_importers/test_collect_fix_commit.py @@ -0,0 +1,124 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import json +from pathlib import Path +from unittest import TestCase +from unittest.mock import MagicMock +from unittest.mock import patch + +import pytest + +from vulnerabilities.pipelines.v2_importers.collect_repo_fix_commits import ( + CollectRepoFixCommitPipeline, +) +from vulnerabilities.tests import util_tests + + +@pytest.fixture +def pipeline(): + pipeline = CollectRepoFixCommitPipeline() + pipeline.repo_url = "https://github.com/test/repo" + pipeline.log = MagicMock() + return pipeline + + +def test_classify_commit_type_extracts_ids(pipeline): + class DummyCommit: + message = "Fix for CVE-2023-1234 and GHSA-2479-qvv7-47qq" + + result = pipeline.classify_commit_type(DummyCommit) + assert result == ["CVE-2023-1234", "GHSA-2479-qvv7-47qq"] + + +@patch("vulnerabilities.pipelines.v2_importers.collect_repo_fix_commits.Repo") +def test_collect_fix_commits_groups_by_vuln(mock_repo, pipeline): + commit1 = MagicMock(message="Fix CVE-2021-0001", hexsha="abc123") + commit2 = MagicMock(message="Patch GHSA-dead-beef-baad", hexsha="def456") + commit3 = MagicMock(message="Unrelated change", hexsha="ghi789") + + pipeline.repo = MagicMock() + pipeline.repo.iter_commits.return_value = [commit1, commit2, commit3] + + pipeline.classify_commit_type = MagicMock( + side_effect=lambda c: ( + ["CVE-2021-0001"] + if "CVE" in c.message + else ["GHSA-dead-beef-baad"] + if "GHSA" in c.message + else [] + ) + ) + + grouped = pipeline.collect_fix_commits() + + expected = { + "CVE-2021-0001": [("abc123", "Fix CVE-2021-0001")], + "GHSA-dead-beef-baad": [("def456", "Patch GHSA-dead-beef-baad")], + } + + assert grouped == expected + + +TEST_DATA = Path(__file__).parent.parent.parent / "test_data" / "fix_commits" + + +class TestRepoFixCommitPipeline(TestCase): + def test_collect_advisories_from_json(self): + input_file = TEST_DATA / "grouped_commits_input.json" + expected_file = TEST_DATA / "expected_linux_advisory_output.json" + + grouped_commits = json.loads(input_file.read_text(encoding="utf-8")) + + pipeline = CollectRepoFixCommitPipeline() + pipeline.repo_url = "https://github.com/test/repo" + pipeline.log = MagicMock() + pipeline.collect_fix_commits = MagicMock(return_value=grouped_commits) + + result = [adv.to_dict() for adv in pipeline.collect_advisories()] + + util_tests.check_results_against_json(result, expected_file) + + +@pytest.mark.parametrize( + "commit_message, expected_ids", + [ + ("Fix CVE-2023-12345 buffer overflow", ["CVE-2023-12345"]), + ("Address GHSA-abcd-1234-efgh report", ["GHSA-abcd-1234-efgh"]), + ("Python security PYSEC-2021-12345 fix", ["PYSEC-2021-12345"]), + ("Xen XSA-43 security update", ["XSA-43"]), + ( + "Fix CVE-2023-1111 and GHSA-aaaa-bbbb-cccc in kernel", + ["CVE-2023-1111", "GHSA-aaaa-bbbb-cccc"], + ), + ("Refactor logging system with no security ID", []), + ], +) +def test_classify_commit_type_detects_vuln_ids(pipeline, commit_message, expected_ids): + """Ensure classify_commit_type correctly extracts vulnerability IDs.""" + + class DummyCommit: + def __init__(self, message): + self.message = message + + commit = DummyCommit(commit_message) + result = pipeline.classify_commit_type(commit) + + assert result == expected_ids, f"Unexpected result for message: {commit_message}" + + +def test_classify_commit_type_case_insensitive(pipeline): + """Ensure pattern matching is case-insensitive.""" + + class DummyCommit: + message = "fix cVe-2022-9999 and ghSa-dead-beef-baad" + + result = pipeline.classify_commit_type(DummyCommit) + assert any("CVE-2022-9999" in r.upper() for r in result) + assert any("GHSA-DEAD-BEEF-BAAD" in r.upper() for r in result) diff --git a/vulnerabilities/tests/test_data/fix_commits/expected_linux_advisory_output.json b/vulnerabilities/tests/test_data/fix_commits/expected_linux_advisory_output.json new file mode 100644 index 000000000..c34dc05aa --- /dev/null +++ b/vulnerabilities/tests/test_data/fix_commits/expected_linux_advisory_output.json @@ -0,0 +1,40 @@ +[ + { + "advisory_id": "CVE-2021-0001", + "aliases": [ + "CVE-2021-0001" + ], + "summary": "Commits fixing CVE-2021-0001:\n- abc123: Fix CVE-2021-0001", + "affected_packages": [], + "references_v2": [ + { + "reference_id": "", + "reference_type": "", + "url": "https://github.com/test/repo/commit/abc123" + } + ], + "severities": [], + "date_published": null, + "weaknesses": [], + "url": "https://github.com/test/repo" + }, + { + "advisory_id": "GHSA-dead-beef-baad", + "aliases": [ + "GHSA-dead-beef-baad" + ], + "summary": "Commits fixing GHSA-dead-beef-baad:\n- def456: Patch GHSA-dead-beef-baad", + "affected_packages": [], + "references_v2": [ + { + "reference_id": "", + "reference_type": "", + "url": "https://github.com/test/repo/commit/def456" + } + ], + "severities": [], + "date_published": null, + "weaknesses": [], + "url": "https://github.com/test/repo" + } +] \ No newline at end of file diff --git a/vulnerabilities/tests/test_data/fix_commits/grouped_commits_input.json b/vulnerabilities/tests/test_data/fix_commits/grouped_commits_input.json new file mode 100644 index 000000000..9c49d65a4 --- /dev/null +++ b/vulnerabilities/tests/test_data/fix_commits/grouped_commits_input.json @@ -0,0 +1,8 @@ +{ + "CVE-2021-0001": [ + ["abc123", "Fix CVE-2021-0001"] + ], + "GHSA-dead-beef-baad": [ + ["def456", "Patch GHSA-dead-beef-baad"] + ] +} \ No newline at end of file From 9c20737bd221b5b618f982d0786dc9e572310803 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Mon, 13 Oct 2025 15:43:41 +0300 Subject: [PATCH 3/4] Remove a test script related to fix commits and issue tracker Signed-off-by: ziad hany --- .../pipelines/v2_improvers/extract_commits.py | 202 ------------------ .../pipelines/v2_improvers/issue_tracker_.py | 76 ------- 2 files changed, 278 deletions(-) delete mode 100644 vulnerabilities/pipelines/v2_improvers/extract_commits.py delete mode 100644 vulnerabilities/pipelines/v2_improvers/issue_tracker_.py diff --git a/vulnerabilities/pipelines/v2_improvers/extract_commits.py b/vulnerabilities/pipelines/v2_improvers/extract_commits.py deleted file mode 100644 index 1d844c8eb..000000000 --- a/vulnerabilities/pipelines/v2_improvers/extract_commits.py +++ /dev/null @@ -1,202 +0,0 @@ -import bisect -import json -import os -import re -from collections import defaultdict -from typing import List -from typing import Optional -from typing import Tuple - -from git import Commit -from git import Repo - - -def clone_repo(repo_url: str, clone_dir: str) -> str: - os.makedirs(clone_dir, exist_ok=True) - try: - print(f"Cloning {repo_url} into {clone_dir}...") - repo = Repo.clone_from(repo_url, clone_dir) - print("Clone successful.") - return repo.working_tree_dir - except Exception as e: - print(f"Failed to clone repository: {e}") - return "" - - -def classify_commit_type(commit) -> str: - num_parents = len(commit.parents) - if num_parents == 0: - return "root" # never a fix - elif num_parents == 1: - return "normal" # main source of fixes - else: - return "merge" # usually not a fix - - -def detect_fix_commit(commit) -> str: - """ - Detect whether a commit is a bug-fix or vulnerability-fix commit. - Returns: "vulnerability_fix", "other" - """ - msg = commit.message.lower() - - security_patterns = [ - # CVE identifiers - r"\bcve-\d{4}-\d{4,}\b", - # Explicitly marked security fixes - r"\bsecurity fix\b", - r"\bfix security issue\b", - r"\bfix(?:es)? for security\b", - # Permission / privilege escalation - r"\bprivilege escalation\b", - r"\bprivesc\b", - r"\bescalat(?:e|ion) of privilege\b", - # No New Privileges / unsafe exec - r"\bno[- ]new[- ]privs\b", - r"\bunsafe exec\b", - # Refcount / UAF (classic kernel vulns, almost always security) - r"\buse[- ]after[- ]free\b", - r"\buaf\b", - r"\brefcount (?:leak|error|overflow|underflow)\b", - r"\bdouble free\b", - # Out-of-bounds (OOB) - r"\bout[- ]of[- ]bounds\b", - r"\boob\b", - # Info leaks (security-relevant, not generic leaks) - r"\binformation leak\b", - r"\binfo leak\b", - r"\bleak (?:kernel|userns|credentials?|mnt_idmap)\b", - # Bypass - r"\bsecurity bypass\b", - r"\baccess control bypass\b", - r"\bpermission check (?:bug|fix|error)\b", - ] - - SECURITY_REGEX = re.compile("|".join(security_patterns), re.IGNORECASE) - - if SECURITY_REGEX.search(msg): - return "vulnerability_fix" - return "other" - - -def extract_cves(text: str) -> List[str]: - if not text: - return [] - cves = re.findall(r"cve-[0-9]{4}-[0-9]{4,19}", text, flags=re.IGNORECASE) - return list({cve.upper() for cve in cves}) - - -def get_previous_releases( - release_tags_sorted: List[Tuple[str, int]], dates: List[int], commit_date: int -) -> List[str]: - """ - Get all release tags with commit dates strictly before the given commit date. - release_tags_sorted: list of (tag_name, committed_date), sorted by committed_date - dates: list of commit dates (parallel to release_tags_sorted, sorted ascending) - """ - index = bisect.bisect_left(dates, commit_date) - return [tag for tag, _ in release_tags_sorted[:index]] - - -def get_current_or_next_release( - release_tags_sorted: List[Tuple[str, int]], dates: List[int], commit_date: int -) -> Optional[str]: - """ - Get the current release if commit matches a release date, - otherwise return the next release after the commit date. - """ - index = bisect.bisect_left(dates, commit_date) - - # Exact match → this commit is tagged - if index < len(dates) and dates[index] == commit_date: - return release_tags_sorted[index][0] - - # Otherwise, next release after this commit - if index < len(dates): - return release_tags_sorted[index][0] - - # No next release available - return None - - -def get_current_release(repo: Repo, commit: Commit, prev_release_by_date: Optional[str]) -> str: - """ - Return a non-null release tag for the given commit: - 1) exact tag if commit is tagged - 2) nearest reachable tag (fast, first-parent) - 3) latest prior tag by date (fallback) - 4) "NO_TAGS_AVAILABLE" if repo has no tags at all - """ - # 1) Exact tag at this commit - try: - return repo.git.describe("--tags", "--exact-match", commit.hexsha) - except Exception: - pass - - # 2) Nearest reachable tag along first-parent - try: - return repo.git.describe("--tags", "--abbrev=0", "--first-parent", commit.hexsha) - except Exception: - pass - - # 3) Fallback: latest prior tag by date - if prev_release_by_date: - return prev_release_by_date - - # 4) No tags at all - return "NO_TAGS_AVAILABLE" - - -if __name__ == "__main__": - repo_url = "https://github.com/torvalds/linux" - repo_path = "/home/ziad-hany/PycharmProjects/linux" - - repo = Repo(repo_path) - commits_data = [] - cve_list = defaultdict(set) - - # Precompute and sort release tags by commit date - release_tags = [] - for tag in repo.tags: - try: - release_tags.append((tag.name, tag.commit, tag.commit.committed_date)) - except Exception: - continue - - release_tags_sorted = sorted(release_tags, key=lambda x: x[2]) - - # For previous releases lookup (by date) - release_tags_for_previous = [(tag_name, date) for tag_name, _, date in release_tags_sorted] - dates_array = [date for _, date in release_tags_for_previous] - - for commit in repo.iter_commits("--all"): - commit_type = classify_commit_type(commit) - fix_type = detect_fix_commit(commit) - - if fix_type == "vulnerability_fix" and commit_type in ["normal", "merge"]: - # Compute "previous by date" first so we can feed it as a fallback - prev_release_list = get_previous_releases( - release_tags_for_previous, dates_array, commit.committed_date - ) - prev_release_by_date = prev_release_list[-1] if prev_release_list else None - - curr_release = get_current_release(repo, commit, prev_release_by_date) - - commit_info = { - "hash": commit.hexsha, - "url": repo_url + "/commit/" + commit.hexsha, - "message": commit.message.strip(), - "curr_release": curr_release, - "prev_release": prev_release_list, - "fix_type": fix_type, - } - print(commit_info) - commits_data.append(commit_info) - - # Optional CVE collection - for cve_id in extract_cves(commit.message.strip()): - cve_list[cve_id].add(repo_url + "/commit/" + commit.hexsha) - - result = {cve: list(commits) for cve, commits in cve_list.items()} - print(f"Found {len(result)} unique CVEs") - print(json.dumps(result, indent=2)) diff --git a/vulnerabilities/pipelines/v2_improvers/issue_tracker_.py b/vulnerabilities/pipelines/v2_improvers/issue_tracker_.py deleted file mode 100644 index 05edd4008..000000000 --- a/vulnerabilities/pipelines/v2_improvers/issue_tracker_.py +++ /dev/null @@ -1,76 +0,0 @@ -from abc import ABC -from abc import abstractmethod -from typing import Dict -from typing import List -from typing import Optional - -import requests - - -class IssueTrackerClient(ABC): - @abstractmethod - def get_issues(self, project: str, **kwargs) -> List[Dict]: - pass - - @abstractmethod - def get_pull_requests(self, project: str, **kwargs) -> List[Dict]: - pass - - @abstractmethod - def get_comments(self, project: str, **kwargs) -> List[Dict]: - pass - - -class IssueTrackerFactory: - @staticmethod - def create_client(platform: str, token: Optional[str] = None, **kwargs) -> IssueTrackerClient: - platform = platform.lower() - - GIT_PLATFORM_CLIENT = { - "github": GitHubClient, - } - - if platform not in GIT_PLATFORM_CLIENT: - raise ValueError(f"Unsupported platform: {platform}") - - return GIT_PLATFORM_CLIENT[platform](token=token, **kwargs) - - -class GitHubClient(IssueTrackerClient): - API_BASE = "https://api.github.com" - - def __init__(self, token: Optional[str] = None): - self.session = requests.Session() - self.session.headers.update({"Accept": "application/vnd.github.v3+json"}) - if token: - self.session.headers["Authorization"] = f"token {token}" - - def _paginate(self, url: str, params: dict = None) -> List[Dict]: - results, page = [], 1 - params = params or {} - while True: - params.update({"per_page": 100, "page": page}) - response = self.session.get(url, params=params) - response.raise_for_status() - data = response.json() - if not data: - break - results.extend(data) - page += 1 - return results - - def get_issues(self, project: str, state: str = "all") -> List[Dict]: - owner, repo = project.split("/") - url = f"{self.API_BASE}/repos/{owner}/{repo}/issues" - issues = self._paginate(url, {"state": state}) - return [i for i in issues if "pull_request" not in i] - - def get_pull_requests(self, project: str, state: str = "all") -> List[Dict]: - owner, repo = project.split("/") - url = f"{self.API_BASE}/repos/{owner}/{repo}/pulls" - return self._paginate(url, {"state": state}) - - def get_comments(self, project: str, issue_num: int) -> List[Dict]: - owner, repo = project.split("/") - url = f"{self.API_BASE}/repos/{owner}/{repo}/issues/{issue_num}/comments" - return self._paginate(url) From 5ffdf86a84078c53c08a6c856fb13f35e28fe143 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Tue, 14 Oct 2025 18:14:04 +0300 Subject: [PATCH 4/4] Resolve requested changes Signed-off-by: ziad hany --- .../v2_importers/collect_repo_fix_commits.py | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/vulnerabilities/pipelines/v2_importers/collect_repo_fix_commits.py b/vulnerabilities/pipelines/v2_importers/collect_repo_fix_commits.py index 8c6671a31..521fed4ab 100644 --- a/vulnerabilities/pipelines/v2_importers/collect_repo_fix_commits.py +++ b/vulnerabilities/pipelines/v2_importers/collect_repo_fix_commits.py @@ -1,7 +1,5 @@ -import os import re import shutil -import subprocess import tempfile from collections import defaultdict @@ -38,17 +36,13 @@ def clone(self): """Clone the repository.""" self.repo_url = "https://github.com/torvalds/linux" repo_path = tempfile.mkdtemp() - cmd = [ - "git", - "clone", - "--bare", - "--filter=blob:none", - "--no-checkout", - self.repo_url, - repo_path, - ] - subprocess.run(cmd, check=True) - self.repo = Repo(repo_path) + self.repo = Repo.clone_from( + url=self.repo_url, + to_path=repo_path, + bare=True, + no_checkout=True, + multi_options=["--filter=blob:none"], + ) def advisories_count(self) -> int: return int(self.repo.git.rev_list("--count", "HEAD")) @@ -109,8 +103,8 @@ def collect_advisories(self): def clean_downloads(self): """Cleanup any temporary repository data.""" self.log("Cleaning up local repository resources.") - if os.path.isdir(self.repo.working_tree_dir): - shutil.rmtree(path=self.repo.working_tree_dir) + if hasattr(self, "repo") and self.repo.working_dir: + shutil.rmtree(path=self.repo.working_dir) def on_failure(self): """Ensure cleanup is always performed on failure."""