kata-containers · stevenhorsman · May 2, 2025 · May 2, 2025 · May 2, 2025 · May 2, 2025
diff --git a/elections/tools/README.md b/elections/tools/README.md
@@ -22,29 +22,19 @@ $ python3 -m venv .venv
 $ .venv/bin/pip install pytz github3.py pyyaml
 ```
 
-Before running the tool you will need to create a
+Before running the tool you will need to create a personal
 [GitHub API token](https://github.blog/2013-05-16-personal-api-tokens/)
+set as an environment variable called: `GH_TOKEN`
 
-replace `__API_TOKEN__` in the script with your personal token.
-
-Also update the election start and end times to cover the period being
-examined for this election period. The lines to edit look like:
-
-```python
-start_time = datetime.datetime(2018, 1, 1, 0, 0, 0, tzinfo=pytz.UTC)
-end_time = datetime.datetime(2018, 8, 1, 0, 0, 0, tzinfo=pytz.UTC)
-```
-
-Then run the tool with:
+Then run the tool supplying `-end <date of candidate nomination in %d/%m/%y format>`.
+e.g. if the nomination period began on 7th April 2025:
 
 ```bash
-$ .venv/bin/python ./generate_electorate.py
+$ .venv/bin/python ./generate_electorate.py -end 07/04/25
 ```
 
-The code looks at all commits in all Kata Containers repos *except*
-`kata-containers/linux` and `kata-containers/qemu`. As both of these are forks
-(in the GitHub sense) they'll have lots of contributors that may not be Kata
-contributors.
+The code looks at all commits in all the active Kata Containers repos . A number of
+archived/forks repos are ignored to save time/avoid including non Kata contributors.
 
 For contributors that have more than one email address it picks one as default
 but supplies all the others so  we can be smarter about where to send the

diff --git a/elections/tools/generate_electorate.py b/elections/tools/generate_electorate.py
@@ -2,14 +2,17 @@
 
 #
 # Copyright (c) 2023 Kata Contributors
+# Copyright (c) 2025 IBM Corporation
 #
 # SPDX-License-Identifier: Apache-2.0
 #
 # Description: Generate a list of kata contributors by extracting contact
 # information from GitHub
 
+import argparse
 import datetime
-import pytz
+from datetime import timedelta
+import os
 import re
 import yaml
 
@@ -67,71 +70,96 @@ def _author_representer(dumper, data):
                          commit_count=data.commit_count)
     return dumper.represent_dict(o_dict.items())
 
-
-dco_re = re.compile('signed.off.by[: ]*(?P<name>[^<]*)<(?P<email>.*)>$',
-                    re.IGNORECASE | re.MULTILINE)
-# Get a token GitHub Personal API token see:
-#   https://blog.github.com/2013-05-16-personal-api-tokens/
-# for more information.
-gh = login(token='__API_TOKEN__')
-org = gh.organization('kata-containers')
-# Example dates for testing.
-start_time = datetime.datetime(2018, 1, 1, 0, 0, 0, tzinfo=pytz.UTC)
-end_time = datetime.datetime(2018, 8, 1, 0, 0, 0, tzinfo=pytz.UTC)
-# ... Or run just include all commits
-# start_time = end_time = None
-# All commits
-number = -1
-projects = []
-ignored_repos = [
-        'cgroups-rs',
+def find_authors_by_project(start_time, end_time):
+    dco_re = re.compile('signed.off.by[: ]*(?P<name>[^<]*)<(?P<email>.*)>$',
+                        re.IGNORECASE | re.MULTILINE)
+    # Get a token GitHub Personal API token see:
+    #   https://blog.github.com/2013-05-16-personal-api-tokens/
+    # for more information.
+    try:
+        personal_token=os.environ['GH_TOKEN']
+    except KeyError:
+        raise Exception("GH_TOKEN environment variable was not set")
+
+    gh = login(token=personal_token)
+    org = gh.organization('kata-containers')
+    number = -1
+    projects = []
+    ignored_repos = [
+        'agent',
+        'ci',
         'dbs-snapshot',
+        'documentation',
         'edk2',
-        'qemu',
-        'linux',
-        'project-infra',
         'govmm',
-        'resolve-pr-refs',
         'is-organization-member',
+        'kata-containers-github-actions-tests',
+        'ksm-throttler',
+        'linux',
         'osbuilder',
+        'packaging',
+        'project-infra',
         'proxy',
+        'qemu',
+        'resolve-pr-refs',
         'runtime',
         'shim',
-        'packaging',
-        'ksm-throttler',
-        'documentation',
-        'agent',
         'slash-command-action',
-        'tests-1',
-        'kata-containers-github-actions-tests',
-        'kata-containers-cache-kernel',
-        'kata-containers-2',
-        'kata-containers-1',
-        ]
-
-author_cache = {}
-for repo in org.repositories():
-    # Skip these repos as they are not a core part of the project, and are
-    # forked/imported so contain many contributors from outside the project.
-    if str(repo).split("/")[1] in ignored_repos:
-        print('Skipping repo %s' % (repo))
-        continue
-    print('Looking for changes in %s between %s and %s' %
-          (repo, start_time, end_time))
-
-    authors = AuthorSet()
-    for branch in repo.branches():
-        for commit in repo.commits(sha=branch.name, since=start_time, until=end_time,
-                                   number=number):
+        'tests',
+    ]
+
+    # Let's build a list of the users we can't get the logins for so
+    # we can prompt the runner to update the following map
+    unknown_logins = {}
+
+    # Some committers have emails that don't map to their
+    # userid properly, so maintain a map of these so the data is consistent
+    email_id_map = {
+        "[email protected]":"lifupan",
+        "[email protected]": "Ankita13-code",
+        "[email protected]": "ChengyuZhu6",
+        "[email protected]": "lima-emanuel",
+        "[email protected]": "l8huang",
+        "[email protected]": "Bickor",
+        "[email protected]": "huoqifeng",
+        "[email protected]": "niteeshkd",
+        "[email protected]": "gpyrros",
+        "[email protected]": "cmaf",
+        "[email protected]": "amshinde",
+        "[email protected]": "seungukshin",
+        }
+
+    author_cache = {}
+    for repo in org.repositories():
+        # Skip these repos as they are not a core part of the project, and are
+        # forked/imported/archived so contain many contributors from outside the project.
+        # Also skip the github security advisory repos for quicker processing
+        if str(repo).split("/")[1] in ignored_repos or str(repo).split("/")[1].startswith('kata-containers-ghsa'):
+            print('Skipping repo %s' % (repo))
+            continue
+        print('Looking for changes in %s between %s and %s' %
+            (repo, start_time, end_time))
+
+        authors = AuthorSet()
+        for commit in repo.commits(since=start_time, until=end_time, number=number):
+            # If a commit has >1 parents then it's a merge commit, so skip these
+            if len(commit.parents) > 1:
+                continue
+
             if commit.author is None:
                 if commit.commit.author is None:
                     print('Skipping %s in %s as it has no author. Did this merge via GitHub?' %
-                          (commit, repo))
+                        (commit, repo))
                     continue
 
                 author_id = commit.commit.author.get('email')
-                print('%s in %s as has no author. Using email (%s) as the author id' %
-                      (commit, repo, author_id))
+                if author_id in email_id_map:
+                    author_id = email_id_map[author_id]
+                else:
+                    if not author_id in unknown_logins:
+                        unknown_logins[author_id] = commit.html_url
+                    print('%s in %s as has no author. Using email (%s) as the author id' %
+                (commit, repo, author_id))
             else:
                 author_id = commit.author.login
 
@@ -167,12 +195,39 @@ def _author_representer(dumper, data):
                 if author.name is None and match.group('name'):
                     author.name = match.group('name')
             authors.add(author)
-    projects.append({str(repo): authors})
-
-# Dark YAML voodoo
-yaml.Dumper.ignore_aliases = lambda *args: True
-yaml.Dumper.add_representer(AuthorSet, _authorset_representer)
-yaml.Dumper.add_representer(Author, _author_representer)
-with open('electorate.yaml', 'w') as f:
-    yaml.dump(projects, f, default_flow_style=False, default_style='',
-              explicit_start=True)
+        projects.append({str(repo): authors})
+
+    if len(unknown_logins) > 0:
+        print("Warning: failed to match some emails. Please follow the commit link and add the email -> id " \
+        "map to `email_id_map` in generate_electorate.py:")
+        for email, commit_html in unknown_logins.items():
+            print("Email", email, "who committed", commit_html)
+    return projects
+
+def main():
+
+    parser = argparse.ArgumentParser(description='An electorate generation script')
+    parser.add_argument("-end", required=True,help='the end date of the period to examine in format %%d/%%m/%%y.')
+    parser.add_argument("-start",  help='the start date of the period to examine in format %%d/%%m/%%y.  If not set will default to' \
+    '365 days before the end time')
+
+    args = parser.parse_args()
+    end_time = datetime.datetime.strptime(args.end, '%d/%m/%y')
+    start_time = end_time - timedelta(days=365)
+    if args.start != None:
+        start_time = datetime.datetime.strptime(args.start, '%d/%m/%y')
+
+    print("Getting committers from", start_time, " -> ", end_time)
+
+    projects=find_authors_by_project(start_time, end_time)
+
+    # Dark YAML voodoo
+    yaml.Dumper.ignore_aliases = lambda *args: True
+    yaml.Dumper.add_representer(AuthorSet, _authorset_representer)
+    yaml.Dumper.add_representer(Author, _author_representer)
+    with open('electorate.yaml', 'w') as f:
+        yaml.dump(projects, f, default_flow_style=False, default_style='',
+                explicit_start=True)
+
+if __name__ == '__main__':
+    main()