Skip to content

Commit 8ab19b4

Browse files
committed
Refactor data collection into lifetime and sprint workflows
1 parent f23fb6f commit 8ab19b4

8 files changed

Lines changed: 235 additions & 87 deletions

File tree

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
# Runs twice a month to collect full repository history
# This updates the long-term health data used by the dashboard

name: Lifetime Data Collection

on:
  schedule:
    # Runs at midnight on the 1st and 15th of every month
    - cron: "0 0 1,15 * *"
  workflow_dispatch:

jobs:
  collect-lifetime-data:
    runs-on: ubuntu-latest

    steps:
      # Checkout the Data_Updates branch (never push to main)
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: Data_Updates

      # Set up Python environment
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      # Install project dependencies
      - name: Install dependencies
        run: pip install -r requirements.txt

      # Run data collection in lifetime mode
      - name: Run lifetime data collection
        run: python Backend/dataCollection/collectData.py --mode lifetime

      # Commit and push updated JSON file.
      # Guard with a staged-diff check: `git commit` exits non-zero when
      # there is nothing to commit, which would fail the whole workflow.
      - name: Commit and push changes
        run: |
          git config user.name "github-actions"
          git config user.email "actions@github.com"
          git add data/lifetime_data.json
          if ! git diff --staged --quiet; then
            git commit -m "Update lifetime repository data"
            git push origin Data_Updates
          fi

.github/workflows/sprint_data.yml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
# Runs during active sprints
# Collects sprint-level data for the Team Stats page

name: Sprint Data Collection

on:
  schedule:
    # Runs every Monday at midnight
    # The script itself checks if a sprint is active
    # (fixed: original cron "0 0 * *. 1" had a stray period and is invalid)
    - cron: "0 0 * * 1"
  workflow_dispatch:

jobs:
  collect-sprint-data:
    runs-on: ubuntu-latest

    steps:
      # Checkout the Data_Updates branch (never push to main)
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: Data_Updates

      # Set up Python environment
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      # Install project dependencies
      - name: Install dependencies
        run: pip install -r requirements.txt

      # Run data collection in sprint mode
      # (fixed: original passed --mode lifetime here, so sprint data was
      # never actually collected by this workflow)
      - name: Run sprint data collection
        run: python Backend/dataCollection/collectData.py --mode sprint

      # Commit and push updated JSON file.
      # Guard with a staged-diff check: `git commit` exits non-zero when
      # there is nothing to commit, which would fail the whole workflow.
      - name: Commit and push changes
        run: |
          git config user.name "github-actions"
          git config user.email "actions@github.com"
          git add data/sprint_data.json
          if ! git diff --staged --quiet; then
            git commit -m "Update sprint data"
            git push origin Data_Updates
          fi

Backend/config/configs.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
import os
from dotenv import load_dotenv
from pathlib import Path

# Load the .env file from the project root so local runs and the GitHub
# Actions workflows resolve the same file. This module lives at
# Backend/config/configs.py, so the root is three levels up.
project_root = Path(__file__).resolve().parents[2]
load_dotenv(project_root / ".env")

# GitHub API token shared by the data-collection modules.
GIT_TOKEN = os.getenv("GIT_TOKEN")

# Fail fast with a clear message rather than letting later API calls
# fail with an opaque authentication error.
if not GIT_TOKEN:
    raise ValueError("GIT_TOKEN not found! Please set it in .env file")
Lines changed: 106 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,83 +1,123 @@
11
import os
2-
from github import Github, Auth
3-
from dataCollection.formatJSON import format_json_data
4-
from config.configs import GIT_TOKEN
5-
from dataCollection.pullRequest import get_pr_data
6-
from dataCollection.issue import get_issue_data
7-
from dataCollection.commit import get_commit_data
8-
import pandas as pd
92
import json
3+
import argparse
4+
from datetime import datetime
5+
from github import Github, Auth
6+
from Backend.config.configs import GIT_TOKEN
7+
from Backend.dataCollection.formatJSON import format_json_data
8+
from Backend.dataCollection.pullRequest import get_pr_data
9+
from Backend.dataCollection.issue import get_issue_data
10+
from Backend.dataCollection.commit import get_commit_data
1011

11-
def test_pr_data():
12-
repo_name = "oss-slu/oss_dev_analytics"
13-
g = Github(auth=Auth.Token(GIT_TOKEN))
14-
sprint = 7
15-
pr_data = get_pr_data(g, repo_name, sprint)
16-
print(pr_data.head())
17-
return pr_data
18-
def test_issue_data():
19-
repo_name = "oss-slu/oss_dev_analytics"
20-
g = Github(auth=Auth.Token(GIT_TOKEN))
21-
sprint = 7
22-
issue_data = get_issue_data(g, repo_name, sprint)
23-
with pd.option_context('display.max_rows', None, 'display.max_columns', None): #just want to make sure the df looks correct
24-
print(issue_data)
25-
return issue_data
26-
def test_commit_data():
27-
repo_name = "oss-slu/oss_dev_analytics"
28-
g = Github(auth=Auth.Token(GIT_TOKEN))
29-
sprint = 7
30-
commit_data = get_commit_data(g, repo_name, sprint)
31-
with pd.option_context('display.max_rows', None, 'display.max_columns', None): #just want to make sure the df looks correct
32-
print(commit_data)
33-
return commit_data
3412

35-
def dev_analytics_test():
36-
repo_name = "oss-slu/oss_dev_analytics"
37-
g = Github(auth=Auth.Token(GIT_TOKEN))
38-
sprint = 7
39-
issue_data = get_issue_data(g, repo_name, sprint)
40-
pr_data = get_pr_data(g, repo_name, sprint)
41-
commit_data = get_commit_data(g, repo_name, sprint)
42-
out = {
43-
"issues": issue_data.to_dict(orient='records'),
44-
"pull_requests": pr_data.to_dict(orient='records'),
45-
"commits": commit_data.to_dict(orient='records')
46-
}
47-
#now send the temp .json to a function that puts it into the format we need to show analytics
48-
formatted = format_json_data(out, sprint)
49-
with open("test_data.json", "w") as outfile:
50-
json.dump(formatted, outfile, indent=4, default=str)
51-
def other_repo(repo, sprint = -1):
52-
repo_name = "oss-slu/"+repo
def get_current_sprint(schedule_path="data/sprint_schedule.json"):
    """Return the number of the currently active sprint, or None.

    Reads the sprint schedule JSON file and compares today's date
    against each sprint's start/end dates (inclusive on both ends).

    Args:
        schedule_path: Path to the schedule JSON file. Defaults to the
            repository-relative location used by the workflows, so
            existing callers are unaffected.

    Returns:
        The sprint number whose window contains today's date, or None
        when no sprint is active or the schedule cannot be read.
    """
    try:
        with open(schedule_path, "r") as file:
            schedule = json.load(file)

        today = datetime.today().date()

        # Schedule shape: {semester: [{"sprint": n, "start": "YYYY-MM-DD",
        # "end": "YYYY-MM-DD"}, ...], ...}
        for sprints in schedule.values():
            for sprint in sprints:
                start = datetime.strptime(sprint["start"], "%Y-%m-%d").date()
                end = datetime.strptime(sprint["end"], "%Y-%m-%d").date()

                # If today is inside the sprint window, return that
                # sprint's number
                if start <= today <= end:
                    return sprint["sprint"]

        # No sprint window contains today's date
        return None

    except (OSError, json.JSONDecodeError, KeyError, TypeError, ValueError) as e:
        # Narrowed from a bare `except Exception`: missing/unreadable file,
        # malformed JSON, missing keys, or bad date strings. Preserves the
        # original best-effort behavior of logging and returning None.
        print(f"Error reading sprint schedule: {e}")
        return None
# Collect data for one repository
# sprint = -1 means lifetime mode (full history)
# Any other sprint number means sprint-only data
def other_repo(repo, sprint=-1, org="oss-slu"):
    """Collect issue, PR, and commit data for one repository and merge it
    into the shared aggregate JSON file.

    Args:
        repo: Repository name without the organization prefix.
        sprint: Sprint number to filter by, or -1 for lifetime mode
            (full repository history).
        org: GitHub organization owning the repository. Defaults to
            "oss-slu", preserving the original hard-coded behavior.
    """
    repo_name = f"{org}/{repo}"
    g = Github(auth=Auth.Token(GIT_TOKEN))

    # Get issue, PR, and commit data from the GitHub API
    issue_data = get_issue_data(g, repo_name, sprint)
    pr_data = get_pr_data(g, repo_name, sprint)
    commit_data = get_commit_data(g, repo_name, sprint)

    # Converting each DataFrame into plain records for JSON serialization
    out = {
        "issues": issue_data.to_dict(orient="records"),
        "pull_requests": pr_data.to_dict(orient="records"),
        "commits": commit_data.to_dict(orient="records")
    }

    # Format data into the structure the dashboard expects
    formatted = format_json_data(out, sprint)

    # Deciding which file to write to:
    # Lifetime data goes into lifetime_data.json,
    # sprint data goes into sprint_data.json
    path = (
        "data/lifetime_data.json"
        if sprint == -1
        else "data/sprint_data.json"
    )

    # Making sure the data folder exists
    os.makedirs("data", exist_ok=True)

    # Load the existing aggregate file, or start fresh when it does not
    # exist yet (the original wrote an empty {} to disk and immediately
    # re-read it; this avoids that redundant round trip)
    if os.path.exists(path):
        with open(path, "r") as infile:
            data = json.load(infile)
    else:
        data = {}

    # Using repo name as key for lifetime data;
    # for sprint mode, include sprint number in the key
    key = repo if sprint == -1 else f"{repo}_sprint_{sprint}"
    data[key] = formatted

    # Writing updated data back to file.
    # NOTE(review): default=str stringifies anything json can't encode
    # (e.g. timestamps) — deliberate here, but verify downstream readers
    # expect string dates.
    with open(path, "w") as outfile:
        json.dump(data, outfile, indent=4, default=str)
# Entry point used by the GitHub Actions workflows.
# --mode lifetime collects full history; --mode sprint collects the
# active sprint's data (and does nothing outside sprint dates).
if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--mode",
        choices=["lifetime", "sprint"],
        required=True
    )
    mode = arg_parser.parse_args().mode

    tracked_repos = ["lrda_mobile", "oss_dev_analytics"]

    # Lifetime mode always collects with sprint = -1; sprint mode first
    # resolves the active sprint number, which is None when no sprint
    # window contains today's date.
    active_sprint = -1 if mode == "lifetime" else get_current_sprint()

    if active_sprint is None:
        # Outside sprint dates: do nothing
        print("No active sprint found")
    else:
        for tracked_repo in tracked_repos:
            other_repo(tracked_repo, active_sprint)

Backend/dataCollection/commit.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import pandas as pd
2-
from dataCollection.sprintFiltering import filter_data_by_sprint
2+
from Backend.dataCollection.sprintFiltering import filter_data_by_sprint
33

44
def get_commit_data(g, repo_name, sprint = -1):
55
"""

Backend/dataCollection/issue.py

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import pandas as pd
2-
from dataCollection.sprintFiltering import filter_data_by_sprint
2+
from Backend.dataCollection.sprintFiltering import filter_data_by_sprint
33

44

55
def get_issue_data(g, repo_name, sprint = -1):
@@ -34,7 +34,11 @@ def get_issue_data(g, repo_name, sprint = -1):
3434

3535
print(f"Issue Repository: {repo.name}") #debugging only
3636
issues = repo.get_issues(state="all", sort="created", direction="desc")
37-
issues_filtered = filter_data_by_sprint(issues, sprint)
37+
#issues_filtered = filter_data_by_sprint(issues, sprint)
38+
if sprint == -1:
39+
issues_filtered = issues
40+
else:
41+
issues_filtered = filter_data_by_sprint(issues, sprint)
3842
for issue in issues_filtered:
3943
try:
4044
# Skip pull requests
@@ -49,11 +53,13 @@ def get_issue_data(g, repo_name, sprint = -1):
4953
time_to_close = (closed_at - created_at).total_seconds() / 3600
5054
else:
5155
time_to_close = None
56+
assigned_time = None
57+
5258
#Get info for Cycle time calculation
53-
for event in issue.get_events():
54-
if event.event == "assigned":
55-
assigned_time = event.created_at
56-
break
59+
# for event in issue.get_events():
60+
# if event.event == "assigned":
61+
# assigned_time = event.created_at
62+
# break
5763

5864
issue_records.append({
5965
'repository': repo.name,
@@ -81,11 +87,12 @@ def get_issue_data(g, repo_name, sprint = -1):
8187
dataframe['issues_opened'] = issues_opened
8288
dataframe['issues_closed'] = issues_closed
8389
dataframe['wip_issues'] = wip_issues
84-
dataframe['cycle_time'] = dataframe.apply(
85-
lambda row: (row['closed_at'] - row['assigned_time']).total_seconds() / 3600
86-
if pd.notnull(row['closed_at']) and pd.notnull(row['assigned_time']) else None,
87-
axis=1
88-
)
90+
#dataframe['cycle_time'] = dataframe.apply(
91+
#lambda row: (row['closed_at'] - row['assigned_time']).total_seconds() / 3600
92+
#if pd.notnull(row['closed_at']) and pd.notnull(row['assigned_time']) else None,
93+
#axis=1
94+
#)
95+
dataframe['cycle_time'] = None
8996
return dataframe
9097

9198

Backend/dataCollection/pullRequest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from github import Github
22
import pandas as pd
3-
from dataCollection.sprintFiltering import filter_data_by_sprint
3+
from Backend.dataCollection.sprintFiltering import filter_data_by_sprint
44

55
def get_pr_data(g, repo_name, sprint = -1):
66
"""

data/sprint_schedule.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"Spring_2026": [
3+
{
4+
"sprint": 1,
5+
"start": "2026-01-20",
6+
"end": "2026-02-02"
7+
},
8+
{
9+
"sprint": 2,
10+
"start": "2026-02-03",
11+
"end": "2026-02-16"
12+
}
13+
]
14+
}

0 commit comments

Comments
 (0)