Skip to content

Commit 8ab19b4

Browse files
committed
Refactor data collection into lifetime and sprint workflows
1 parent f23fb6f commit 8ab19b4

8 files changed

Lines changed: 235 additions & 87 deletions

File tree

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
# Runs twice a month to collect full repository history
# This updates the long-term health data used by the dashboard

name: Lifetime Data Collection

on:
  schedule:
    # Runs at midnight on the 1st and 15th of every month
    - cron: "0 0 1,15 * *"
  workflow_dispatch:

jobs:
  collect-lifetime-data:
    runs-on: ubuntu-latest

    steps:
      # Checkout the Data_Updates branch (never push to main)
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: Data_Updates

      # Set up Python environment
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      # Install project dependencies
      - name: Install dependencies
        run: pip install -r requirements.txt

      # Run data collection in lifetime mode
      - name: Run lifetime data collection
        run: python Backend/dataCollection/collectData.py --mode lifetime

      # Commit and push updated JSON file.
      # Guard with a staged-diff check: `git commit` exits non-zero when
      # there is nothing to commit, which would fail the whole workflow.
      - name: Commit and push changes
        run: |
          git config user.name "github-actions"
          git config user.email "actions@github.com"
          git add data/lifetime_data.json
          if ! git diff --staged --quiet; then
            git commit -m "Update lifetime repository data"
            git push origin Data_Updates
          fi

.github/workflows/sprint_data.yml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
# Runs during active sprints
# Collects sprint-level data for the Team Stats page

name: Sprint Data Collection

on:
  schedule:
    # Runs every Monday at midnight
    # The script itself checks if a sprint is active
    # (fixed: original cron "0 0 * *. 1" had a stray period and is invalid)
    - cron: "0 0 * * 1"
  workflow_dispatch:

jobs:
  collect-sprint-data:
    runs-on: ubuntu-latest

    steps:
      # Checkout the Data_Updates branch (never push to main)
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: Data_Updates

      # Set up Python environment
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      # Install project dependencies
      - name: Install dependencies
        run: pip install -r requirements.txt

      # Run data collection in sprint mode
      # (fixed: original passed --mode lifetime here, so sprint data was
      # never actually collected by this workflow)
      - name: Run sprint data collection
        run: python Backend/dataCollection/collectData.py --mode sprint

      # Commit and push updated JSON file.
      # Guard with a staged-diff check: `git commit` exits non-zero when
      # there is nothing to commit, which would fail the whole workflow.
      - name: Commit and push changes
        run: |
          git config user.name "github-actions"
          git config user.email "actions@github.com"
          git add data/sprint_data.json
          if ! git diff --staged --quiet; then
            git commit -m "Update sprint data"
            git push origin Data_Updates
          fi

Backend/config/configs.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
import os
from dotenv import load_dotenv
from pathlib import Path

# Load the .env file from the project root so local runs and the GitHub
# Actions workflows resolve the same file. This module lives at
# Backend/config/configs.py, so the root is three levels up.
project_root = Path(__file__).resolve().parents[2]
load_dotenv(project_root / ".env")

# GitHub API token shared by the data-collection modules.
GIT_TOKEN = os.getenv("GIT_TOKEN")

# Fail fast with a clear message rather than letting later API calls
# fail with an opaque authentication error.
if not GIT_TOKEN:
    raise ValueError("GIT_TOKEN not found! Please set it in .env file")
Lines changed: 106 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,83 +1,123 @@
11
import os
2-
from github import Github, Auth
3-
from dataCollection.formatJSON import format_json_data
4-
from config.configs import GIT_TOKEN
5-
from dataCollection.pullRequest import get_pr_data
6-
from dataCollection.issue import get_issue_data
7-
from dataCollection.commit import get_commit_data
8-
import pandas as pd
92
import json
3+
import argparse
4+
from datetime import datetime
5+
from github import Github, Auth
6+
from Backend.config.configs import GIT_TOKEN
7+
from Backend.dataCollection.formatJSON import format_json_data
8+
from Backend.dataCollection.pullRequest import get_pr_data
9+
from Backend.dataCollection.issue import get_issue_data
10+
from Backend.dataCollection.commit import get_commit_data
1011

11-
def test_pr_data():
12-
repo_name = "oss-slu/oss_dev_analytics"
13-
g = Github(auth=Auth.Token(GIT_TOKEN))
14-
sprint = 7
15-
pr_data = get_pr_data(g, repo_name, sprint)
16-
print(pr_data.head())
17-
return pr_data
18-
def test_issue_data():
19-
repo_name = "oss-slu/oss_dev_analytics"
20-
g = Github(auth=Auth.Token(GIT_TOKEN))
21-
sprint = 7
22-
issue_data = get_issue_data(g, repo_name, sprint)
23-
with pd.option_context('display.max_rows', None, 'display.max_columns', None): #just want to make sure the df looks correct
24-
print(issue_data)
25-
return issue_data
26-
def test_commit_data():
27-
repo_name = "oss-slu/oss_dev_analytics"
28-
g = Github(auth=Auth.Token(GIT_TOKEN))
29-
sprint = 7
30-
commit_data = get_commit_data(g, repo_name, sprint)
31-
with pd.option_context('display.max_rows', None, 'display.max_columns', None): #just want to make sure the df looks correct
32-
print(commit_data)
33-
return commit_data
3412

35-
def dev_analytics_test():
36-
repo_name = "oss-slu/oss_dev_analytics"
37-
g = Github(auth=Auth.Token(GIT_TOKEN))
38-
sprint = 7
39-
issue_data = get_issue_data(g, repo_name, sprint)
40-
pr_data = get_pr_data(g, repo_name, sprint)
41-
commit_data = get_commit_data(g, repo_name, sprint)
42-
out = {
43-
"issues": issue_data.to_dict(orient='records'),
44-
"pull_requests": pr_data.to_dict(orient='records'),
45-
"commits": commit_data.to_dict(orient='records')
46-
}
47-
#now send the temp .json to a function that puts it into the format we need to show analytics
48-
formatted = format_json_data(out, sprint)
49-
with open("test_data.json", "w") as outfile:
50-
json.dump(formatted, outfile, indent=4, default=str)
51-
def other_repo(repo, sprint = -1):
52-
repo_name = "oss-slu/"+repo
def get_current_sprint(schedule_path="data/sprint_schedule.json"):
    """Return the number of the currently active sprint, or None.

    Reads the sprint schedule JSON file and compares today's date
    against each sprint's start/end dates (inclusive on both ends).

    Args:
        schedule_path: Path to the schedule JSON file. Defaults to the
            repository-relative location used by the workflows, so
            existing callers are unaffected.

    Returns:
        The sprint number whose window contains today's date, or None
        when no sprint is active or the schedule cannot be read.
    """
    try:
        with open(schedule_path, "r") as file:
            schedule = json.load(file)

        today = datetime.today().date()

        # Schedule shape: {semester: [{"sprint": n, "start": "YYYY-MM-DD",
        # "end": "YYYY-MM-DD"}, ...], ...}
        for sprints in schedule.values():
            for sprint in sprints:
                start = datetime.strptime(sprint["start"], "%Y-%m-%d").date()
                end = datetime.strptime(sprint["end"], "%Y-%m-%d").date()

                # If today is inside the sprint window, return that
                # sprint's number
                if start <= today <= end:
                    return sprint["sprint"]

        # No sprint window contains today's date
        return None

    except (OSError, json.JSONDecodeError, KeyError, TypeError, ValueError) as e:
        # Narrowed from a bare `except Exception`: missing/unreadable file,
        # malformed JSON, missing keys, or bad date strings. Preserves the
        # original best-effort behavior of logging and returning None.
        print(f"Error reading sprint schedule: {e}")
        return None
# Collect data for one repository
# sprint = -1 means lifetime mode (full history)
# Any other sprint number means sprint-only data
def other_repo(repo, sprint=-1, org="oss-slu"):
    """Collect issue, PR, and commit data for one repository and merge it
    into the shared aggregate JSON file.

    Args:
        repo: Repository name without the organization prefix.
        sprint: Sprint number to filter by, or -1 for lifetime mode
            (full repository history).
        org: GitHub organization owning the repository. Defaults to
            "oss-slu", preserving the original hard-coded behavior.
    """
    repo_name = f"{org}/{repo}"
    g = Github(auth=Auth.Token(GIT_TOKEN))

    # Get issue, PR, and commit data from the GitHub API
    issue_data = get_issue_data(g, repo_name, sprint)
    pr_data = get_pr_data(g, repo_name, sprint)
    commit_data = get_commit_data(g, repo_name, sprint)

    # Converting each DataFrame into plain records for JSON serialization
    out = {
        "issues": issue_data.to_dict(orient="records"),
        "pull_requests": pr_data.to_dict(orient="records"),
        "commits": commit_data.to_dict(orient="records")
    }

    # Format data into the structure the dashboard expects
    formatted = format_json_data(out, sprint)

    # Deciding which file to write to:
    # Lifetime data goes into lifetime_data.json,
    # sprint data goes into sprint_data.json
    path = (
        "data/lifetime_data.json"
        if sprint == -1
        else "data/sprint_data.json"
    )

    # Making sure the data folder exists
    os.makedirs("data", exist_ok=True)

    # Load the existing aggregate file, or start fresh when it does not
    # exist yet (the original wrote an empty {} to disk and immediately
    # re-read it; this avoids that redundant round trip)
    if os.path.exists(path):
        with open(path, "r") as infile:
            data = json.load(infile)
    else:
        data = {}

    # Using repo name as key for lifetime data;
    # for sprint mode, include sprint number in the key
    key = repo if sprint == -1 else f"{repo}_sprint_{sprint}"
    data[key] = formatted

    # Writing updated data back to file.
    # NOTE(review): default=str stringifies anything json can't encode
    # (e.g. timestamps) — deliberate here, but verify downstream readers
    # expect string dates.
    with open(path, "w") as outfile:
        json.dump(data, outfile, indent=4, default=str)
# Entry point used by the GitHub Actions workflows.
# --mode lifetime collects full history; --mode sprint collects the
# active sprint's data (and does nothing outside sprint dates).
if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--mode",
        choices=["lifetime", "sprint"],
        required=True
    )
    mode = arg_parser.parse_args().mode

    tracked_repos = ["lrda_mobile", "oss_dev_analytics"]

    # Lifetime mode always collects with sprint = -1; sprint mode first
    # resolves the active sprint number, which is None when no sprint
    # window contains today's date.
    active_sprint = -1 if mode == "lifetime" else get_current_sprint()

    if active_sprint is None:
        # Outside sprint dates: do nothing
        print("No active sprint found")
    else:
        for tracked_repo in tracked_repos:
            other_repo(tracked_repo, active_sprint)

Backend/dataCollection/commit.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import pandas as pd
2-
from dataCollection.sprintFiltering import filter_data_by_sprint
2+
from Backend.dataCollection.sprintFiltering import filter_data_by_sprint
33

44
def get_commit_data(g, repo_name, sprint = -1):
55
"""

Backend/dataCollection/issue.py

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import pandas as pd
2-
from dataCollection.sprintFiltering import filter_data_by_sprint
2+
from Backend.dataCollection.sprintFiltering import filter_data_by_sprint
33

44

55
def get_issue_data(g, repo_name, sprint = -1):
@@ -34,7 +34,11 @@ def get_issue_data(g, repo_name, sprint = -1):
3434

3535
print(f"Issue Repository: {repo.name}") #debugging only
3636
issues = repo.get_issues(state="all", sort="created", direction="desc")
37-
issues_filtered = filter_data_by_sprint(issues, sprint)
37+
#issues_filtered = filter_data_by_sprint(issues, sprint)
38+
if sprint == -1:
39+
issues_filtered = issues
40+
else:
41+
issues_filtered = filter_data_by_sprint(issues, sprint)
3842
for issue in issues_filtered:
3943
try:
4044
# Skip pull requests
@@ -49,11 +53,13 @@ def get_issue_data(g, repo_name, sprint = -1):
4953
time_to_close = (closed_at - created_at).total_seconds() / 3600
5054
else:
5155
time_to_close = None
56+
assigned_time = None
57+
5258
#Get info for Cycle time calculation
53-
for event in issue.get_events():
54-
if event.event == "assigned":
55-
assigned_time = event.created_at
56-
break
59+
# for event in issue.get_events():
60+
# if event.event == "assigned":
61+
# assigned_time = event.created_at
62+
# break
5763

5864
issue_records.append({
5965
'repository': repo.name,
@@ -81,11 +87,12 @@ def get_issue_data(g, repo_name, sprint = -1):
8187
dataframe['issues_opened'] = issues_opened
8288
dataframe['issues_closed'] = issues_closed
8389
dataframe['wip_issues'] = wip_issues
84-
dataframe['cycle_time'] = dataframe.apply(
85-
lambda row: (row['closed_at'] - row['assigned_time']).total_seconds() / 3600
86-
if pd.notnull(row['closed_at']) and pd.notnull(row['assigned_time']) else None,
87-
axis=1
88-
)
90+
#dataframe['cycle_time'] = dataframe.apply(
91+
#lambda row: (row['closed_at'] - row['assigned_time']).total_seconds() / 3600
92+
#if pd.notnull(row['closed_at']) and pd.notnull(row['assigned_time']) else None,
93+
#axis=1
94+
#)
95+
dataframe['cycle_time'] = None
8996
return dataframe
9097

9198

Backend/dataCollection/pullRequest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from github import Github
22
import pandas as pd
3-
from dataCollection.sprintFiltering import filter_data_by_sprint
3+
from Backend.dataCollection.sprintFiltering import filter_data_by_sprint
44

55
def get_pr_data(g, repo_name, sprint = -1):
66
"""

data/sprint_schedule.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"Spring_2026": [
3+
{
4+
"sprint": 1,
5+
"start": "2026-01-20",
6+
"end": "2026-02-02"
7+
},
8+
{
9+
"sprint": 2,
10+
"start": "2026-02-03",
11+
"end": "2026-02-16"
12+
}
13+
]
14+
}

0 commit comments

Comments
 (0)