From 6b522ce4ef1701345e19f82141451da4055f3eaf Mon Sep 17 00:00:00 2001 From: Joe Olson Date: Fri, 8 Aug 2025 10:45:29 -0500 Subject: [PATCH 1/2] Closes #292 Signed-off-by: Joe Olson --- .github/tools/collect_metrics.py | 714 ++++++++++++++++++++++++++ .github/workflows/collect_metrics.yml | 106 ++++ 2 files changed, 820 insertions(+) create mode 100644 .github/tools/collect_metrics.py create mode 100644 .github/workflows/collect_metrics.yml diff --git a/.github/tools/collect_metrics.py b/.github/tools/collect_metrics.py new file mode 100644 index 000000000..52d733a16 --- /dev/null +++ b/.github/tools/collect_metrics.py @@ -0,0 +1,714 @@ +"""Collects various metrics from a GitHub repository using the GitHub API. + +Copyright 2025 +SPDX-License-Identifier: Apache-2.0 +Authors: Trevor Grant + +Metrics include: +- Standard repository attributes (stars, forks, issues, etc.) +- New forks and contributors in the last period +- Traffic data (views, clones, referrers, paths) +- Issues and pull requests opened/closed in the last period +- Comments on issues and pull requests in the last period +- Discussions metrics via GraphQL +Outputs the collected metrics to a Parquet file. + +This is uploaded to the GitHub AI Alliance repository for metrics collection. +""" + +import os +import sys +import pandas as pd +from github import ( + Github, + GithubException, + UnknownObjectException, +) # Added UnknownObjectException +from datetime import datetime, timedelta, timezone # Added timezone +import time +import requests +import json + + +# --- GraphQL Helper --- +def run_graphql_query(token, query, variables=None): + """Runs a GraphQL query against the GitHub API.""" + graphql_url = "https://api.github.com/graphql" + headers = { + "Authorization": f"bearer {token}", + "Content-Type": "application/json", + } + payload = {"query": query} + if variables: + payload["variables"] = variables + + try: + response = requests.post( + graphql_url, headers=headers, json=payload, timeout=30 + ) # Added timeout + response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx) + + json_response = response.json() + + # Check for GraphQL-specific errors + if "errors" in json_response: + error_details = json.dumps(json_response["errors"], indent=2) + # Consider logging this instead of printing if it gets noisy + print(f"GraphQL query failed with errors:\n{error_details}") + # Decide how to handle: return None, raise exception, etc. 
+ # Returning None might be suitable for metrics collection + return None + + return json_response.get("data") + + except requests.exceptions.RequestException as e: + print(f"Error during GraphQL request: {e}") + return None + except json.JSONDecodeError as e: + print(f"Error decoding GraphQL JSON response: {e}") + return None + + +# --- Configuration --- +token = os.getenv("GITHUB_TOKEN") +repo_name = os.getenv("GITHUB_REPOSITORY") # Format: 'owner/repo' +lookback_days = 1 # Define the period for "new" items (e.g., last 1 day) +output_filename = f"github_metrics_{datetime.now().strftime('%Y%m%d_%H%M%S')}.parquet" +max_retries = 3 # Retries for API calls that might need time + +# --- Input Validation --- +if not token: + print("Error: GITHUB_TOKEN environment variable not set.") + sys.exit(1) +if not repo_name: + print("Error: GITHUB_REPOSITORY environment variable not set.") + sys.exit(1) + +print(f"Starting metrics collection for repository: {repo_name}") +print(f"Lookback period for 'new' items: {lookback_days} day(s)") + +# --- GitHub API Connection --- +try: + g = Github(token, retry=5, timeout=15) # Added retry and timeout to Github object + repo = g.get_repo(repo_name) + print("Successfully connected to GitHub API.") +except GithubException as e: + print(f"Error connecting to GitHub API or getting repository: {e}") + sys.exit(1) + +# --- Define Cutoff Time (UTC) --- +# Use timezone-aware datetime object for 'since' parameter +cutoff_datetime_aware = datetime.now(timezone.utc) - timedelta(days=lookback_days) +# Use naive datetime for simple comparisons if needed (e.g., fork creation) +cutoff_datetime_naive = datetime.utcnow() - timedelta(days=lookback_days) +print(f"Calculating 'new' items since: {cutoff_datetime_aware}") + + +# --- Data Collection --- +metrics = {} +metrics["timestamp_utc"] = datetime.now(timezone.utc) # Store timezone-aware timestamp +metrics["repository_name"] = repo.full_name + +print("\nFetching standard repository metrics...") +try: + # Standard Attributes + metrics["stars"] = repo.stargazers_count + metrics["watchers"] = repo.subscribers_count + metrics["forks_total"] = repo.forks_count # Renamed for clarity + metrics["open_issues_total"] = repo.open_issues_count # Renamed for clarity + metrics["network_count"] = repo.network_count + metrics["size_kb"] = repo.size + metrics["language"] = repo.language + metrics["created_at_utc"] = repo.created_at + metrics["pushed_at_utc"] = repo.pushed_at + metrics["archived"] = repo.archived + metrics["disabled"] = repo.disabled + metrics["has_issues"] = repo.has_issues + metrics["has_projects"] = repo.has_projects + metrics["has_wiki"] = repo.has_wiki + metrics["has_pages"] = repo.has_pages + metrics["has_downloads"] = repo.has_downloads + metrics["has_discussions"] = repo.has_discussions # Added check for discussions + metrics["license"] = repo.get_license().license.spdx_id if repo.license else None + + # List Counts + metrics["contributors_count_total"] = repo.get_contributors().totalCount + metrics["releases_count_total"] = repo.get_releases().totalCount + +except GithubException as e: + print(f"Warning: Could not fetch some standard metrics: {e}") + # Initialize potentially missed metrics + standard_keys = [ + "stars", + "watchers", + "forks_total", + "open_issues_total", + "network_count", + "size_kb", + "language", + "created_at_utc", + "pushed_at_utc", + "archived", + "disabled", + "has_issues", + "has_projects", + "has_wiki", + "has_pages", + "has_downloads", + "has_discussions", + "license", + 
"contributors_count_total", + "releases_count_total", + ] + for key in standard_keys: + if key not in metrics: + metrics[key] = None + + +# --- Calculated Metrics: "New" Forks (Last Period) --- +print(f"\nCalculating new forks in the last {lookback_days} day(s)...") +new_forks_count = 0 +try: + # Iterate through forks - PaginatedList handles pagination + for fork in repo.get_forks(): + # Compare naive UTC datetimes + if fork.created_at.replace(tzinfo=None) >= cutoff_datetime_naive: + new_forks_count += 1 + else: + # Forks are sorted newest first, so we can stop early + break + metrics["forks_new_last_period"] = new_forks_count + print(f"Found {new_forks_count} new forks.") +except GithubException as e: + print(f"Warning: Could not calculate new forks: {e}") + metrics["forks_new_last_period"] = None + + +# --- Calculated Metrics: "New" Contributors (Approximation using stats) --- +# Stays the same as the previously corrected version - using weekly stats approximation +print(f"\nCalculating recent contributor additions (weekly stats)...") +recent_contributor_adds = 0 +retries = 0 +stats_contributors = None +while retries < max_retries: + try: + stats_contributors = repo.get_stats_contributors() + if stats_contributors is not None: + break + else: + print( + f"Contributor stats not available yet (Attempt {retries+1}/{max_retries}). Waiting 30 seconds..." + ) + time.sleep(30) + retries += 1 + except GithubException as e: + if e.status == 202: + print( + f"Contributor stats computing (Attempt {retries+1}/{max_retries}). Waiting 30 seconds..." + ) + time.sleep(30) + retries += 1 + elif e.status == 404: + print(f"Warning: Contributor stats API 404: {e}") + stats_contributors = [] # Treat as empty + break + else: + print(f"Warning: Could not fetch contributor stats: {e}") + stats_contributors = None + break + +if stats_contributors is not None: + cutoff_date_stats_naive = ( + cutoff_datetime_naive # Use naive for comparison consistency + ) + print( + f"Cutoff date for contributor stats: {cutoff_date_stats_naive} UTC (comparing week start)" + ) + for stat in stats_contributors: + if not hasattr(stat, "weeks") or not stat.weeks: + continue + for week_stat in stat.weeks: + if not hasattr(week_stat, "w"): + continue + week_start_time = week_stat.w + if isinstance(week_start_time, datetime): + naive_week_start = week_start_time.replace(tzinfo=None) + if naive_week_start >= cutoff_date_stats_naive: + if hasattr(week_stat, "a") and isinstance(week_stat.a, int): + recent_contributor_adds += week_stat.a + else: + print( + f"Warning: Unexpected type for week_stat.w: {type(week_start_time)}. Skipping." + ) + metrics["contributors_additions_recent_weeks"] = ( + recent_contributor_adds # Changed name slightly + ) + print( + f"Found {recent_contributor_adds} contributor additions (approx) based on recent weekly stats." 
+ ) +else: + metrics["contributors_additions_recent_weeks"] = None + if retries == max_retries: + print("Warning: Contributor stats could not be retrieved.") + + +# --- Traffic Data (Last Day if available) --- +print("\nFetching traffic data (views and clones)...") + +# Define the target date (yesterday in UTC) +target_traffic_date_naive = (datetime.utcnow() - timedelta(days=1)).date() +print(f"Targeting traffic data for date: {target_traffic_date_naive}") + +# Initialize metrics for the target date +metrics["traffic_views_last_day_total"] = None +metrics["traffic_views_last_day_unique"] = None +metrics["traffic_clones_last_day_total"] = None +metrics["traffic_clones_last_day_unique"] = None + +# --- Process Views --- +found_view_data_for_target = False +try: + views_traffic_obj = repo.get_views_traffic(per="day") + # Check if the object and the 'views' list exist and are not empty + if ( + views_traffic_obj + and hasattr(views_traffic_obj, "views") + and views_traffic_obj.views + ): + # Iterate through the list of daily view data + for view_entry in views_traffic_obj.views: + # Check if the entry has a timestamp and it's a datetime object + if hasattr(view_entry, "timestamp") and isinstance( + view_entry.timestamp, datetime + ): + # Compare the date part of the timestamp with our target date + if view_entry.timestamp.date() == target_traffic_date_naive: + # Found the data for yesterday! + metrics["traffic_views_last_day_total"] = getattr( + view_entry, "count", None + ) # Use getattr for safety + metrics["traffic_views_last_day_unique"] = getattr( + view_entry, "uniques", None + ) + print( + f"Found views for {target_traffic_date_naive}: Total={metrics['traffic_views_last_day_total']}, Unique={metrics['traffic_views_last_day_unique']}" + ) + found_view_data_for_target = True + break # Stop searching once found + else: + print( + f"Warning: Skipping view entry with missing or invalid timestamp: {view_entry}" + ) + + # After checking all entries, if we didn't find the target date + if not found_view_data_for_target: + print( + f"Warning: No view data found specifically for target date {target_traffic_date_naive}. Metrics remain None." + ) + # metrics['traffic_views_last_day_total'] = 0 # Alternative: default to 0 if preferred + # metrics['traffic_views_last_day_unique'] = 0 + + else: + print( + "Warning: No daily view data list available or attribute 'views' missing." + ) + # Metrics remain None as initialized + +except GithubException as e: + print(f"Warning: Could not fetch views traffic data: {e}") + # Metrics remain None +except Exception as e: # Catch other potential errors during processing + print(f"Warning: Error processing view data: {e}") + # Metrics remain None + + +# --- Process Clones --- +found_clone_data_for_target = False +try: + clones_traffic_obj = repo.get_clones_traffic(per="day") + # Check if the object and the 'clones' list exist and are not empty + if ( + clones_traffic_obj + and hasattr(clones_traffic_obj, "clones") + and clones_traffic_obj.clones + ): + # Iterate through the list of daily clone data + for clone_entry in clones_traffic_obj.clones: + # Check if the entry has a timestamp and it's a datetime object + if hasattr(clone_entry, "timestamp") and isinstance( + clone_entry.timestamp, datetime + ): + # Compare the date part of the timestamp with our target date + if clone_entry.timestamp.date() == target_traffic_date_naive: + # Found the data for yesterday! 
+ metrics["traffic_clones_last_day_total"] = getattr( + clone_entry, "count", None + ) # Use getattr for safety + metrics["traffic_clones_last_day_unique"] = getattr( + clone_entry, "uniques", None + ) + print( + f"Found clones for {target_traffic_date_naive}: Total={metrics['traffic_clones_last_day_total']}, Unique={metrics['traffic_clones_last_day_unique']}" + ) + found_clone_data_for_target = True + break # Stop searching once found + else: + print( + f"Warning: Skipping clone entry with missing or invalid timestamp: {clone_entry}" + ) + + # After checking all entries, if we didn't find the target date + if not found_clone_data_for_target: + print( + f"Warning: No clone data found specifically for target date {target_traffic_date_naive}. Metrics remain None." + ) + # metrics['traffic_clones_last_day_total'] = 0 # Alternative: default to 0 + # metrics['traffic_clones_last_day_unique'] = 0 + + else: + print( + "Warning: No daily clone data list available or attribute 'clones' missing." + ) + # Metrics remain None as initialized + +except GithubException as e: + print(f"Warning: Could not fetch clones traffic data: {e}") + # Metrics remain None +except Exception as e: # Catch other potential errors during processing + print(f"Warning: Error processing clone data: {e}") + # Metrics remain None + + +# --- Referrers and Popular Content Data (Last 14 days) --- +print("\nFetching top referrers data (last 14 days)...") +top_referrers_data = [] # Initialize empty list +try: + top_referrers_list = repo.get_top_referrers() + # Iterate through the PaginatedList of Referrer objects + for r in top_referrers_list: + # Extract relevant data into a dictionary + # Use getattr for safety in case attributes are missing unexpectedly + referrer_dict = { + "referrer": getattr(r, "referrer", None), + "count": getattr(r, "count", None), + "uniques": getattr(r, "uniques", None), + } + top_referrers_data.append(referrer_dict) + + metrics["traffic_top_referrers_data"] = ( + top_referrers_data # Store the list of dicts + ) + print(f"Fetched {len(top_referrers_data)} top referrer entries.") + +except GithubException as e: + print(f"Warning: Could not fetch top referrers: {e}") + metrics["traffic_top_referrers_data"] = None # Set to None on API error +except Exception as e: # Catch potential errors during processing the list + print(f"Warning: Error processing referrer data: {e}") + metrics["traffic_top_referrers_data"] = None + + +print("\nFetching top paths data (last 14 days)...") +top_paths_data = [] # Initialize empty list +try: + top_paths_list = repo.get_top_paths() + # Iterate through the PaginatedList of Path objects + for p in top_paths_list: + # Extract relevant data into a dictionary + path_dict = { + "path": getattr(p, "path", None), + "title": getattr(p, "title", None), # Title might not always exist + "count": getattr(p, "count", None), + "uniques": getattr(p, "uniques", None), + } + top_paths_data.append(path_dict) + + metrics["traffic_top_paths_data"] = top_paths_data # Store the list of dicts + print(f"Fetched {len(top_paths_data)} top path entries.") + +except GithubException as e: + print(f"Warning: Could not fetch top paths: {e}") + metrics["traffic_top_paths_data"] = None # Set to None on API error +except Exception as e: # Catch potential errors during processing the list + print(f"Warning: Error processing path data: {e}") + metrics["traffic_top_paths_data"] = None + + +# --- Issues Opened/Closed Last Period --- +print(f"\nCalculating Issues opened/closed in the last {lookback_days} day(s)...") 
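+# Note: the REST `since` parameter on the issues endpoint filters by *last update*
+# time rather than creation time, so created_at is re-checked explicitly below before
+# an item is counted as newly opened.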
+issues_opened_count = 0
+issues_closed_count = 0
+try:
+    # Issues opened: count items whose created_at falls within the lookback window
+    opened_issues = repo.get_issues(
+        state="all", sort="created", since=cutoff_datetime_aware
+    )
+    for issue in opened_issues:
+        # Note: get_issues() returns PRs as well; both are counted here as "issues" opened.
+        if issue.created_at >= cutoff_datetime_aware:
+            issues_opened_count += 1
+
+    # Issues closed: Need to check closed_at time
+    # Get recently updated closed issues/PRs
+    recently_updated_closed_items = repo.get_issues(
+        state="closed", sort="updated", direction="desc", since=cutoff_datetime_aware
+    )
+    for item in recently_updated_closed_items:
+        if item.closed_at and item.closed_at >= cutoff_datetime_aware:
+            # Check if it's actually an Issue (not a PR)
+            # An item is a PR if it has the 'pull_request' attribute
+            if not hasattr(item, "pull_request") or not item.pull_request:
+                issues_closed_count += 1
+
+    metrics["issues_opened_last_period"] = issues_opened_count
+    metrics["issues_closed_last_period"] = issues_closed_count
+    print(f"Found: Opened={issues_opened_count}, Closed={issues_closed_count}")
+
+except GithubException as e:
+    print(f"Warning: Could not calculate issue metrics: {e}")
+    metrics["issues_opened_last_period"] = None
+    metrics["issues_closed_last_period"] = None
+
+# --- Pull Requests Opened/Closed Last Period ---
+print(f"\nCalculating PRs opened/closed in the last {lookback_days} day(s)...")
+prs_opened_count = 0
+prs_closed_count = 0  # Includes merged PRs
+prs_merged_count = 0
+try:
+    # PRs opened: list pulls sorted by creation date (newest first) and count until the cutoff
+    opened_pulls = repo.get_pulls(
+        state="all", sort="created", direction="desc", base=repo.default_branch
+    )  # Filter by base branch if desired
+    # The pulls endpoint has no 'since' parameter, so filter by created_at manually
+    for pr in opened_pulls:
+        if pr.created_at >= cutoff_datetime_aware:
+            prs_opened_count += 1
+        else:
+            break  # List is sorted by creation date descending, so we can stop here
+
+    # PRs closed/merged: Check closed_at/merged_at
+    # Get recently updated closed PRs
+    recently_updated_closed_pulls = repo.get_pulls(
+        state="closed", sort="updated", direction="desc"
+    )  # the pulls endpoint does not accept 'since'; rely on the updated_at early exit below
+    for pr in recently_updated_closed_pulls:
+        # Stop once the PR's last update is older than the cutoff
+        if pr.updated_at < cutoff_datetime_aware:
+            break
+
+        if pr.closed_at and pr.closed_at >= cutoff_datetime_aware:
+            prs_closed_count += 1
+        if pr.merged_at and pr.merged_at >= cutoff_datetime_aware:
+            prs_merged_count += 1
+
+    metrics["prs_opened_last_period"] = prs_opened_count
+    metrics["prs_closed_last_period"] = prs_closed_count
+    metrics["prs_merged_last_period"] = prs_merged_count
+    print(
+        f"Found: Opened={prs_opened_count}, Closed={prs_closed_count}, Merged={prs_merged_count}"
+    )
+
+except GithubException as e:
+    print(f"Warning: Could not calculate PR metrics: {e}")
+    metrics["prs_opened_last_period"] = None
+    metrics["prs_closed_last_period"] = None
+    metrics["prs_merged_last_period"] = None
+
+
+# --- Comments (Issues, PRs) Last Period ---
+print(f"\nCalculating Issue/PR Comments in the last {lookback_days} day(s)...")
+issue_comments_last_period = 0
+pr_comments_last_period = 0  # Includes review comments and general PR comments
+
+try:
+    # General Issue/PR comments (use issues endpoint)
+    all_comments = repo.get_issues_comments(
+        sort="created",
direction="desc", since=cutoff_datetime_aware + ) + for comment in all_comments: + # Check creation date again just to be sure + if comment.created_at >= cutoff_datetime_aware: + # Differentiate based on URL + if "/pull/" in comment.html_url: + pr_comments_last_period += 1 + else: + issue_comments_last_period += 1 + else: + # Since comments are sorted desc, we can stop + break + + # PR Review Comments + review_comments = repo.get_pulls_comments( + sort="created", direction="desc", since=cutoff_datetime_aware + ) + for comment in review_comments: + if comment.created_at >= cutoff_datetime_aware: + pr_comments_last_period += 1 + else: + break + + metrics["issue_comments_last_period"] = issue_comments_last_period + metrics["pr_comments_last_period"] = pr_comments_last_period # Combined count + print( + f"Found: Issue Comments={issue_comments_last_period}, PR Comments={pr_comments_last_period}" + ) + +except GithubException as e: + print(f"Warning: Could not calculate comment metrics: {e}") + metrics["issue_comments_last_period"] = None + metrics["pr_comments_last_period"] = None + + +# --- Discussions Metrics (via GraphQL) --- +print( + f"\nCalculating Discussion Metrics for the last {lookback_days} day(s) via GraphQL..." +) + +# Initialize metrics +metrics["discussions_opened_last_period"] = None +metrics["discussions_comments_last_period"] = None # Still challenging + +# Check if discussions are enabled first +if metrics.get("has_discussions"): + # Format the cutoff date as an ISO 8601 string for GraphQL + since_iso_string = cutoff_datetime_aware.isoformat() + + # --- Query for Discussions Opened --- + # Use the search API via GraphQL for efficient filtering by creation date + search_query_string = ( + f"repo:{repo_name} type:discussion is:open created:>={since_iso_string}" + ) + # Also count closed ones created in the period? Add another query or adjust logic if needed. + # This query counts currently 'open' discussions created since the cutoff. + + discussions_search_query = """ + query($searchQuery: String!) { + search(query: $searchQuery, type: DISCUSSION, first: 0) { + discussionCount + } + } + """ + variables = {"searchQuery": search_query_string} + + print( + f"Running GraphQL search for new discussions with query: '{search_query_string}'" + ) + graphql_data_disc = run_graphql_query(token, discussions_search_query, variables) + + if graphql_data_disc and "search" in graphql_data_disc: + try: + metrics["discussions_opened_last_period"] = graphql_data_disc["search"][ + "discussionCount" + ] + print( + f"Found via GraphQL Search: Discussions Opened={metrics['discussions_opened_last_period']}" + ) + except (KeyError, TypeError) as e: + print( + f"Warning: Could not extract discussion count from GraphQL response: {e}. Response: {graphql_data_disc}" + ) + else: + print("Warning: Failed to get discussion count via GraphQL search.") + + # --- Query for Discussion Comments --- + # NOTE: Getting an exact count of *all* comments across *all* discussions created + # within a specific time window using a single, efficient GraphQL query is difficult. + # The search API doesn't seem to support filtering for `type:discussioncomment` directly. + # A possible (but potentially slow and complex) alternative would be: + # 1. Fetch discussions UPDATED since the cutoff date. + # 2. For each discussion, fetch comments created since the cutoff date (requires pagination per discussion). + # This approach can lead to many API calls and is not implemented here for efficiency. 
+ # We will leave `discussions_comments_last_period` as None. + + print( + "Note: Fetching discussion *comment* counts for the period is complex with GraphQL Search and not implemented." + ) + metrics["discussions_comments_last_period"] = None # Explicitly set to None + +else: + print( + "Discussions feature not enabled for this repository. Skipping GraphQL calls." + ) + # Ensure metrics are None if discussions are disabled + metrics["discussions_opened_last_period"] = None + metrics["discussions_comments_last_period"] = None + +# --- Final Data Preparation --- +# Convert datetime objects to string or ensure pyarrow handles them +for key, value in metrics.items(): + if isinstance(value, datetime): + # Ensure timezone-aware datetimes are handled correctly by pyarrow + # Or convert to ISO format string with timezone + if value.tzinfo is None: + # Optional: Make naive datetimes timezone-aware (assuming UTC) before writing + # metrics[key] = value.replace(tzinfo=timezone.utc) + pass # Keep naive UTC as is + # else: keep timezone-aware datetimes as is + # Alternatively, convert all to ISO strings: + # metrics[key] = value.isoformat() + + +# Create DataFrame - Note the list around the dictionary for a single row +try: + df = pd.DataFrame([metrics]) + + # Define specific data types (especially nullable integers) + # Adjust based on the actual metrics collected + dtype_mapping = { + "stars": pd.Int64Dtype(), + "watchers": pd.Int64Dtype(), + "forks_total": pd.Int64Dtype(), + "open_issues_total": pd.Int64Dtype(), + "network_count": pd.Int64Dtype(), + "size_kb": pd.Int64Dtype(), + "contributors_count_total": pd.Int64Dtype(), + "releases_count_total": pd.Int64Dtype(), + "forks_new_last_period": pd.Int64Dtype(), + "contributors_additions_recent_weeks": pd.Int64Dtype(), + "traffic_views_last_day_total": pd.Int64Dtype(), + "traffic_views_last_day_unique": pd.Int64Dtype(), + "traffic_clones_last_day_total": pd.Int64Dtype(), + "traffic_clones_last_day_unique": pd.Int64Dtype(), + # 'traffic_referrers_top_count': pd.Int64Dtype(), 'traffic_popular_paths_top_count': pd.Int64Dtype(), + "issues_opened_last_period": pd.Int64Dtype(), + "issues_closed_last_period": pd.Int64Dtype(), + "prs_opened_last_period": pd.Int64Dtype(), + "prs_closed_last_period": pd.Int64Dtype(), + "prs_merged_last_period": pd.Int64Dtype(), + "issue_comments_last_period": pd.Int64Dtype(), + "pr_comments_last_period": pd.Int64Dtype(), + "discussions_opened_last_period": pd.Int64Dtype(), + "discussions_comments_last_period": pd.Int64Dtype(), + # Add others as needed, boolean types usually fine, string types usually fine + } + # Filter out any keys from mapping that don't exist in the DataFrame (e.g., due to API errors) + valid_dtype_mapping = {k: v for k, v in dtype_mapping.items() if k in df.columns} + df = df.astype(valid_dtype_mapping) + + print("\n--- Collected Metrics ---") + # Print columns horizontally for better readability if many columns + pd.set_option("display.max_columns", None) # Show all columns + pd.set_option("display.width", 1000) # Adjust width as needed + print(df) + + # --- Write to Parquet --- + print(f"\nWriting metrics to {output_filename}...") + df.to_parquet( + output_filename, + engine="pyarrow", + index=False, + coerce_timestamps="us", + allow_truncated_timestamps=False, + ) + print("Successfully wrote Parquet file.") + +except Exception as e: + print(f"Error creating DataFrame or writing Parquet file: {e}") + import traceback + + traceback.print_exc() # Print full traceback for debugging + sys.exit(1) + 
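+# For local inspection, the written file can be read back with pandas, e.g.
+# `pd.read_parquet(output_filename).T` (useful when debugging a workflow run).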
+print("\nScript finished successfully.") diff --git a/.github/workflows/collect_metrics.yml b/.github/workflows/collect_metrics.yml new file mode 100644 index 000000000..8882e8ece --- /dev/null +++ b/.github/workflows/collect_metrics.yml @@ -0,0 +1,106 @@ +# Collects various metrics from a GitHub repository using the GitHub API. +# +# Copyright 2025 +# SPDX-License-Identifier: Apache-2.0 +# Authors: Trevor Grant +# +# Metrics include: +# - Standard repository attributes (stars, forks, issues, etc.) +# - New forks and contributors in the last period +# - Traffic data (views, clones, referrers, paths) +# - Issues and pull requests opened/closed in the last period +# - Comments on issues and pull requests in the last period +# - Discussions metrics via GraphQL +# Outputs the collected metrics to a Parquet file. +# +# This is uploaded to the GitHub AI Alliance repository for metrics collection. + +name: Collect Repo Metrics and Upload to S3 + +on: + workflow_dispatch: # Allows manual triggering + schedule: + - cron: '0 0 * * *' # Run daily at 5 AM UTC + +jobs: + collect-and-upload-metrics: + runs-on: ubuntu-latest + permissions: + contents: read # Read repo content (needed for repo object) + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' # Or your preferred version + + - name: Install dependencies + run: pip install PyGithub pandas pyarrow + + # --- Configure AWS Credentials --- + # Recommended: Use OpenID Connect (OIDC) if your AWS setup supports it. + # Requires setting up a trust relationship in AWS IAM. + # See: https://github.com/aws-actions/configure-aws-credentials#configure-aws-credentials-action-for-github-actions + # - name: Configure AWS Credentials (OIDC) + # uses: aws-actions/configure-aws-credentials@v4 + # with: + # role-to-assume: arn:aws:iam::ACCOUNT-ID-WITHOUT-HYPHENS:role/YOUR_GITHUB_ACTIONS_ROLE # Replace with your IAM role ARN + # aws-region: ${{ secrets.AWS_REGION }} # e.g., us-east-1 + + # Alternative: Use Access Keys (Store as GitHub Secrets) + - name: Configure AWS Credentials (Access Keys) + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ secrets.AWS_REGION }} # e.g., us-east-1 + + # --- Run Python Script to Generate Parquet --- + - name: Run metrics collection script + id: collect_metrics # Give the step an id to potentially reference output later if needed + env: + GITHUB_TOKEN: ${{ secrets.SPECIAL_GH_TOKEN }} # Use the default action token + # GITHUB_REPOSITORY is automatically set by the runner + run: python .github/scripts/collect_metrics.py # Assuming your script is named this + + # --- Upload Parquet File to S3 --- + - name: Upload Parquet to S3 + env: + AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} + # GITHUB_REPOSITORY format is 'owner/repo' + SOURCE_FILE_PATTERN: "github_metrics_*.parquet" # Pattern from the python script + S3_DESTINATION_FILENAME: "blob.parquet" + run: | + # Check if source file exists + if ! ls $SOURCE_FILE_PATTERN 1> /dev/null 2>&1; then + echo "Error: No parquet file matching '$SOURCE_FILE_PATTERN' found." 
+ exit 1 + fi + # Assume only one file matches the pattern from the script run + SOURCE_FILE=$(ls $SOURCE_FILE_PATTERN) + + # Format repository name for S3 path (replace '/' with '-') + # Adjust sed expression if your repo names need different handling + REPOSITORY_NAME_FORMATTED=$(echo "${{ github.repository }}" | sed 's/\//-/g') + + # Get current date in YYYY-MM-DD format + CURRENT_DATE=$(date +%Y-%m-%d) + + # Construct the S3 destination path + S3_PATH="s3://${AWS_S3_BUCKET}/service=github/repository=${REPOSITORY_NAME_FORMATTED}/date=${CURRENT_DATE}/${S3_DESTINATION_FILENAME}" + + echo "Uploading '$SOURCE_FILE' to '$S3_PATH'..." + aws s3 cp "$SOURCE_FILE" "$S3_PATH" + echo "Upload to S3 complete." + + # --- Optional: Upload Parquet artifact to GitHub Actions --- + # Useful for debugging or if you need the file directly from the Actions run + - name: Upload Parquet artifact (Optional) + if: always() # Run even if S3 upload fails, for debugging + uses: actions/upload-artifact@v4 + with: + name: github-metrics-parquet + path: github_metrics_*.parquet # Upload the generated parquet file \ No newline at end of file From 5bd6bef0b85a23e44b0dfbd4b2a21021094ac11b Mon Sep 17 00:00:00 2001 From: Joe Olson Date: Fri, 8 Aug 2025 16:09:32 -0500 Subject: [PATCH 2/2] Closes #292 Signed-off-by: Joe Olson --- .github/workflows/collect_metrics.yml | 184 +++++++++++++------------- 1 file changed, 92 insertions(+), 92 deletions(-) diff --git a/.github/workflows/collect_metrics.yml b/.github/workflows/collect_metrics.yml index 8882e8ece..c99dca6d6 100644 --- a/.github/workflows/collect_metrics.yml +++ b/.github/workflows/collect_metrics.yml @@ -1,9 +1,9 @@ # Collects various metrics from a GitHub repository using the GitHub API. -# +# # Copyright 2025 # SPDX-License-Identifier: Apache-2.0 # Authors: Trevor Grant -# +# # Metrics include: # - Standard repository attributes (stars, forks, issues, etc.) # - New forks and contributors in the last period @@ -12,95 +12,95 @@ # - Comments on issues and pull requests in the last period # - Discussions metrics via GraphQL # Outputs the collected metrics to a Parquet file. -# +# # This is uploaded to the GitHub AI Alliance repository for metrics collection. -name: Collect Repo Metrics and Upload to S3 - -on: - workflow_dispatch: # Allows manual triggering - schedule: - - cron: '0 0 * * *' # Run daily at 5 AM UTC - -jobs: - collect-and-upload-metrics: - runs-on: ubuntu-latest - permissions: - contents: read # Read repo content (needed for repo object) - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' # Or your preferred version - - - name: Install dependencies - run: pip install PyGithub pandas pyarrow - - # --- Configure AWS Credentials --- - # Recommended: Use OpenID Connect (OIDC) if your AWS setup supports it. - # Requires setting up a trust relationship in AWS IAM. 
- # See: https://github.com/aws-actions/configure-aws-credentials#configure-aws-credentials-action-for-github-actions - # - name: Configure AWS Credentials (OIDC) - # uses: aws-actions/configure-aws-credentials@v4 - # with: - # role-to-assume: arn:aws:iam::ACCOUNT-ID-WITHOUT-HYPHENS:role/YOUR_GITHUB_ACTIONS_ROLE # Replace with your IAM role ARN - # aws-region: ${{ secrets.AWS_REGION }} # e.g., us-east-1 - - # Alternative: Use Access Keys (Store as GitHub Secrets) - - name: Configure AWS Credentials (Access Keys) - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ secrets.AWS_REGION }} # e.g., us-east-1 - - # --- Run Python Script to Generate Parquet --- - - name: Run metrics collection script - id: collect_metrics # Give the step an id to potentially reference output later if needed - env: - GITHUB_TOKEN: ${{ secrets.SPECIAL_GH_TOKEN }} # Use the default action token - # GITHUB_REPOSITORY is automatically set by the runner - run: python .github/scripts/collect_metrics.py # Assuming your script is named this - - # --- Upload Parquet File to S3 --- - - name: Upload Parquet to S3 - env: - AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} - # GITHUB_REPOSITORY format is 'owner/repo' - SOURCE_FILE_PATTERN: "github_metrics_*.parquet" # Pattern from the python script - S3_DESTINATION_FILENAME: "blob.parquet" - run: | - # Check if source file exists - if ! ls $SOURCE_FILE_PATTERN 1> /dev/null 2>&1; then - echo "Error: No parquet file matching '$SOURCE_FILE_PATTERN' found." - exit 1 - fi - # Assume only one file matches the pattern from the script run - SOURCE_FILE=$(ls $SOURCE_FILE_PATTERN) - - # Format repository name for S3 path (replace '/' with '-') - # Adjust sed expression if your repo names need different handling - REPOSITORY_NAME_FORMATTED=$(echo "${{ github.repository }}" | sed 's/\//-/g') - - # Get current date in YYYY-MM-DD format - CURRENT_DATE=$(date +%Y-%m-%d) - - # Construct the S3 destination path - S3_PATH="s3://${AWS_S3_BUCKET}/service=github/repository=${REPOSITORY_NAME_FORMATTED}/date=${CURRENT_DATE}/${S3_DESTINATION_FILENAME}" - - echo "Uploading '$SOURCE_FILE' to '$S3_PATH'..." - aws s3 cp "$SOURCE_FILE" "$S3_PATH" - echo "Upload to S3 complete." - - # --- Optional: Upload Parquet artifact to GitHub Actions --- - # Useful for debugging or if you need the file directly from the Actions run - - name: Upload Parquet artifact (Optional) - if: always() # Run even if S3 upload fails, for debugging - uses: actions/upload-artifact@v4 - with: - name: github-metrics-parquet - path: github_metrics_*.parquet # Upload the generated parquet file \ No newline at end of file +name: Collect Repo Metrics and Upload to S3 + +on: + workflow_dispatch: # Allows manual triggering + schedule: + - cron: '0 0 * * *' # Run daily at 5 AM UTC + +jobs: + collect-and-upload-metrics: + runs-on: ubuntu-latest + permissions: + contents: read # Read repo content (needed for repo object) + if: github.repository == 'IBM/mcp-context-forge' + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' # Or your preferred version + + - name: Install dependencies + run: pip install PyGithub pandas pyarrow + + # --- Configure AWS Credentials --- + # Recommended: Use OpenID Connect (OIDC) if your AWS setup supports it. + # Requires setting up a trust relationship in AWS IAM. 
+      # See: https://github.com/aws-actions/configure-aws-credentials#configure-aws-credentials-action-for-github-actions
+      # - name: Configure AWS Credentials (OIDC)
+      #   uses: aws-actions/configure-aws-credentials@v4
+      #   with:
+      #     role-to-assume: arn:aws:iam::ACCOUNT-ID-WITHOUT-HYPHENS:role/YOUR_GITHUB_ACTIONS_ROLE # Replace with your IAM role ARN
+      #     aws-region: ${{ secrets.AWS_REGION }} # e.g., us-east-1
+
+      # Alternative: Use Access Keys (Store as GitHub Secrets)
+      - name: Configure AWS Credentials (Access Keys)
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }} # e.g., us-east-1
+
+      # --- Run Python Script to Generate Parquet ---
+      - name: Run metrics collection script
+        id: collect_metrics # Give the step an id to potentially reference output later if needed
+        env:
+          GITHUB_TOKEN: ${{ secrets.SPECIAL_GH_TOKEN }} # Dedicated token stored as a repository secret (not the default Actions token)
+          # GITHUB_REPOSITORY is automatically set by the runner
+        run: python .github/tools/collect_metrics.py # Path to the metrics script in this repository
+
+      # --- Upload Parquet File to S3 ---
+      - name: Upload Parquet to S3
+        env:
+          AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }}
+          # GITHUB_REPOSITORY format is 'owner/repo'
+          SOURCE_FILE_PATTERN: "github_metrics_*.parquet" # Pattern from the python script
+          S3_DESTINATION_FILENAME: "blob.parquet"
+        run: |
+          # Check if source file exists
+          if ! ls $SOURCE_FILE_PATTERN 1> /dev/null 2>&1; then
+            echo "Error: No parquet file matching '$SOURCE_FILE_PATTERN' found."
+            exit 1
+          fi
+          # Assume only one file matches the pattern from the script run
+          SOURCE_FILE=$(ls $SOURCE_FILE_PATTERN)
+
+          # Format repository name for S3 path (replace '/' with '-')
+          # Adjust sed expression if your repo names need different handling
+          REPOSITORY_NAME_FORMATTED=$(echo "${{ github.repository }}" | sed 's/\//-/g')
+
+          # Get current date in YYYY-MM-DD format
+          CURRENT_DATE=$(date +%Y-%m-%d)
+
+          # Construct the S3 destination path
+          S3_PATH="s3://${AWS_S3_BUCKET}/service=github/repository=${REPOSITORY_NAME_FORMATTED}/date=${CURRENT_DATE}/${S3_DESTINATION_FILENAME}"
+
+          echo "Uploading '$SOURCE_FILE' to '$S3_PATH'..."
+          aws s3 cp "$SOURCE_FILE" "$S3_PATH"
+          echo "Upload to S3 complete."
+
+      # --- Optional: Upload Parquet artifact to GitHub Actions ---
+      # Useful for debugging or if you need the file directly from the Actions run
+      - name: Upload Parquet artifact (Optional)
+        if: always() # Run even if S3 upload fails, for debugging
+        uses: actions/upload-artifact@v4
+        with:
+          name: github-metrics-parquet
+          path: github_metrics_*.parquet # Upload the generated parquet file