Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
189 changes: 91 additions & 98 deletions .github/workflows/broken-link-checker.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
name: Check for Broken Links
on: [push, pull_request]
on:
pull_request:
push:
branches:
- main
- master
- develop

jobs:
build_and_check:
runs-on: ubuntu-latest
Expand Down Expand Up @@ -33,115 +40,101 @@ jobs:
TEMPORARY_WEBSITE_URL: 'http://127.0.0.1:8080'
ACTUAL_WEBSITE_URL: 'https://ddmal.ca/Neon/'
run: |
# Return 0 when an HTTP status code indicates a working link.
# Accepts any 2xx success code plus the common redirect codes 301/302/303.
# (The previous pattern "^(200|301|302|303)$" flagged valid responses such
# as 204 No Content or 206 Partial Content as broken, causing false CI
# failures.)
_link_code_ok() {
  case "$1" in
    2[0-9][0-9]|301|302|303) return 0 ;;
    *) return 1 ;;
  esac
}

# Retry each URL (one per line in $1) up to 3 times, 5 seconds apart.
# Appends "RETRY_SUCCESS:<url>" or "RETRY_FAILED:<url>" to
# /tmp/retry_results (the file is initialized by the caller).
retry_urls() {
  local urls="$1"
  local url attempt http_code
  while IFS= read -r url; do
    [ -z "$url" ] && continue
    echo "🔄 Retrying: $url"

    for attempt in 1 2 3; do
      echo "  Attempt $attempt/3..."
      # -L follows redirects, so http_code is the FINAL status code.
      http_code=$(curl -L -s -o /dev/null -w "%{http_code}" \
        -H "User-Agent: Mozilla/5.0 (compatible; BrokenLinkChecker)" \
        --connect-timeout 30 --max-time 60 "$url" 2>/dev/null)

      if _link_code_ok "$http_code"; then
        echo "  ✅ Success! HTTP $http_code"
        echo "RETRY_SUCCESS:$url" >> /tmp/retry_results
        break
      elif [ "$attempt" -eq 3 ]; then
        echo "  ❌ Failed after 3 attempts (HTTP $http_code)"
        echo "RETRY_FAILED:$url" >> /tmp/retry_results
      else
        echo "  ⏳ Failed with HTTP $http_code, retrying in 5 seconds..."
        sleep 5
      fi
    done
    echo ""
  done <<< "$urls"
}

# Initialize retry results file
> /tmp/retry_results

# Run broken link checker and filter output
echo "Running broken link check..."
output=$(blc $TEMPORARY_WEBSITE_URL --filter-level=3 | \
grep -v -E '├───OK───|└───OK───' | \
awk '
BEGIN { buf="" }
/^Getting links from:/ { buf=$0; next }
/^Finished!.*0 broken\./ {
if (length(buf)>0) { buf=""; next }
}
{
if(length(buf)>0) print buf
if (NF > 0) print
buf=""
}
/^Finished!/ { print "" }
' | sed "s|$TEMPORARY_WEBSITE_URL|$ACTUAL_WEBSITE_URL|g")

echo "Initial link check results:"
echo "$output"

# Handle retryable errors
retryable_urls=$(echo "$output" | grep -E "(BLC_UNKNOWN|HTTP_429)" | \
sed -n 's/.*├─BROKEN─ \(https\?:\/\/[^[:space:]]*\).*/\1/p')

if [ -n "$retryable_urls" ]; then
echo ""
echo "🔄 Found URLs with retryable errors, starting retry process..."
retry_urls "$retryable_urls"
echo "Running broken link check with rate limiting..."

# Run blc with CLI options to avoid rate limiting
# --filter-level 3: Check all link types including metadata
# --ordered: Check links sequentially (helps avoid rate limiting)
# --get: Use GET requests instead of HEAD (more compatible)
# --user-agent: Use realistic browser user agent
# --host-requests 1: Limit to 1 concurrent request per host (key for avoiding 429)
set +e # Don't exit on blc failure, we'll handle it
blc $TEMPORARY_WEBSITE_URL \
--filter-level 3 \
--ordered \
--get \
--user-agent "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" \
--host-requests 1 \
--recursive \
--verbose \
> /tmp/blc_output.txt 2>&1
Comment thread
cursor[bot] marked this conversation as resolved.
blc_exit_code=$?
set -e
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Captured blc exit code is never checked

The comment on line 51 says "Don't exit on blc failure, we'll handle it" and blc_exit_code is captured on line 61, but this variable is never actually used. If blc completely fails to run (crashes, network issues, etc.), the output file will be empty or contain only error messages, the grep for broken links will find nothing, and the workflow will incorrectly report success ("All links verified successfully") even though the link check never completed.

Fix in Cursor Fix in Web


# Display the output
cat /tmp/blc_output.txt

# Get all broken links
all_broken_links=$(grep -E "├─BROKEN─" /tmp/blc_output.txt || true)

# Show retry summary
success_count=$(grep -c "^RETRY_SUCCESS:" /tmp/retry_results 2>/dev/null || echo "0")
failed_count=$(grep -c "^RETRY_FAILED:" /tmp/retry_results 2>/dev/null || echo "0")
echo "📊 Retry Summary: $success_count succeeded, $failed_count failed"
echo ""
echo "=== Broken Links Found by blc ==="
if [ -n "$all_broken_links" ]; then
echo "$all_broken_links"
else
echo "None"
fi

# Determine final status
has_errors=false
# Return 0 when an HTTP status code indicates a working link.
# Accepts any 2xx success code plus the common redirect codes 301/302/303.
# (The previous pattern "^(200|301|302|303)$" flagged valid responses such
# as 204 No Content or 206 Partial Content as broken.)
_curl_code_ok() {
  case "$1" in
    2[0-9][0-9]|301|302|303) return 0 ;;
    *) return 1 ;;
  esac
}

# Re-check with curl a single URL that blc reported as broken.
# Returns 0 on success, and also on HTTP 429 (the URL is appended to the
# caller's $rate_limited_urls — rate-limited, not broken). Returns 1 on a
# real failure and records the status code in $failed_http_code for the
# caller's report.
verify_with_curl() {
  local url="$1"
  local temp_body http_code
  echo "  🔄 Verifying: $url"

  # Use temp file instead of /dev/null to avoid truncation errors on retry
  temp_body=$(mktemp)

  # NOTE(review): --insecure disables TLS certificate verification; kept
  # for parity with existing behavior, but consider removing it.
  # "000" is the fallback when curl itself fails to run.
  http_code=$(curl -L -s -o "$temp_body" -w "%{http_code}" \
    -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" \
    --connect-timeout 30 --max-time 60 --insecure \
    --retry 3 --retry-delay 5 --retry-all-errors \
    "$url" 2>/dev/null || echo "000")

  rm -f "$temp_body"

  if _curl_code_ok "$http_code"; then
    echo "  ✅ Success: HTTP $http_code"
    return 0
  elif [ "$http_code" = "429" ]; then
    echo "  ⚠️ HTTP 429 (rate limited)"
    rate_limited_urls="${rate_limited_urls}${url}\n"
    return 0
  else
    echo "  ❌ Failed: HTTP $http_code"
    failed_http_code="$http_code"
    return 1
  fi
}

# Check for 4xx errors not resolved by retries
if echo "$output" | grep -Eq 'HTTP_4[0-9]{2}'; then
successful_urls=$(grep "^RETRY_SUCCESS:" /tmp/retry_results 2>/dev/null | cut -d: -f2- || echo "")
# Verify all broken links with curl
verified_failures=""
rate_limited_urls=""

unresolved_4xx=$(echo "$output" | grep 'HTTP_4[0-9]{2}' | while read -r line; do
url=$(echo "$line" | sed -n 's/.*├─BROKEN─ \(https\?:\/\/[^[:space:]]*\).*/\1/p')
if [ -n "$url" ] && ! echo "$successful_urls" | grep -Fxq "$url"; then
echo "$line"
fi
done)
if [ -n "$all_broken_links" ]; then
echo ""
echo "=== Verifying Links with curl ==="

if [ -n "$unresolved_4xx" ]; then
echo ""
echo "❌ Unresolved HTTP 4xx errors:"
echo "$unresolved_4xx"
has_errors=true
fi
# Extract URLs and verify them
urls_to_verify=$(echo "$all_broken_links" | sed -n 's/.*├─BROKEN─ \(https\?:\/\/[^[:space:]]*\).*/\1/p')

# Check for failed retries
if grep -q "^RETRY_FAILED:" /tmp/retry_results 2>/dev/null; then
echo ""
echo "❌ URLs that failed after retries:"
grep "^RETRY_FAILED:" /tmp/retry_results | cut -d: -f2-
has_errors=true
fi
while IFS= read -r url; do
[ -z "$url" ] && continue
if ! verify_with_curl "$url"; then
verified_failures="${verified_failures}${url} (HTTP ${failed_http_code})\n"
fi
done <<< "$urls_to_verify"
fi

# Final result
# Final results
echo ""
if [ "$has_errors" = true ]; then
echo "❌ Broken links found that could not be resolved."
if [ -n "$verified_failures" ]; then
echo "❌ CI Failed: The following links failed:"
echo -e "$verified_failures"
exit 1
else
if grep -q "^RETRY_SUCCESS:" /tmp/retry_results 2>/dev/null; then
echo "✅ All broken links resolved via retries! Successfully fixed:"
grep "^RETRY_SUCCESS:" /tmp/retry_results | cut -d: -f2- | sed 's/^/ - /'
else
echo "✅ No broken links found."
if [ -n "$rate_limited_urls" ]; then
echo "⚠️ Note: These links returned HTTP 429 (rate limited, not broken):"
echo -e "$rate_limited_urls"
fi
echo "✅ CI Passed: All links verified successfully"
exit 0
fi
2 changes: 1 addition & 1 deletion .github/workflows/cypress_prod.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name: Scheduled E2E on Chrome
on:
schedule:
- cron: '0 7 * * *' # Runs at 07:00 UTC every day (02:00 AM EST in winter, 03:00 AM EST in summer)
- cron: '0 7 * * 0' # Runs every Sunday at 02:00 Montreal time (EST) / 03:00 Montreal time (EDT)
jobs:
cypress-run:
runs-on: ubuntu-latest
Expand Down
Loading