Skip to main content

recent-github-commits.ipynb (Source)

List all commits in git repositories since a given time

This script will find all git repositories under a given root directory, and then list all commits made to those repositories since a given time.

Repositories are identified by the GitHub remote URL: when you have more than one copy of a repository, the one closest to the root will be used and the rest will be ignored.

In [ ]:
from pathlib import Path
from datetime import datetime, timezone

# Cut-off moment: commits dated before this are left out of the report.
last_blog = datetime(year=2023, month=1, day=1, tzinfo=timezone.utc)

# Top of the directory tree that is searched for git repositories.
root = Path('/home/christian/numbas')

# Owners to report on. A repository whose remote belongs to any organisation
# not listed here is skipped.
want_organisations = {
    'numbas',
    'christianp',
}

# Repositories to leave out even though their organisation is wanted.
# Each entry is an `(organisation, repository)` tuple.
ignore_repos = []
In [ ]:
def find_git_repos(root):
    """Yield each directory at or below ``root`` that contains a ``.git`` entry.

    The tree is walked with an explicit stack (last directory added is the
    next one visited). The walk carries on inside a repository once it has
    been yielded, so repositories nested within other repositories are
    reported as well.

    Sub-directories that are symlinks, or whose name starts with a dot, are
    never descended into. A directory that raises ``PermissionError`` while
    being examined is abandoned and the walk moves on to the next one.

    ``root`` is expected to be a ``pathlib.Path``-like object.
    """
    pending = [root]

    while pending:
        current = pending.pop()

        try:
            if (current / '.git').exists():
                yield current

            for child in current.iterdir():
                # Only real, non-hidden directories are worth visiting.
                if child.is_dir() and not child.is_symlink() and not child.name.startswith('.'):
                    pending.append(child)

        except PermissionError:
            # Unreadable directory: skip it, keeping whatever was queued so far.
            continue
        
# Collect every repository under `root`, then order them shallowest-first
# (fewest parent directories first). The sort is stable, so repositories at the
# same depth stay in discovery order.
all_repos = list(find_git_repos(root))
all_repos.sort(key=lambda repo_dir: len(repo_dir.parents))
In [ ]:
import configparser
import re

# Maps each wanted `(organisation, repository)` pair to the checkout that will
# be reported on. `all_repos` is ordered shallowest-first, so the first checkout
# seen for a key is the one closest to the root; later copies are ignored.
repos = {}
# Every organisation seen in a recognised remote URL, wanted or not.
all_organisations = set()

# Git stores each remote in a config section named `remote "<name>"`.
_REMOTE_SECTION = re.compile(r'remote "(.*?)"')
# Recognises SSH-style (`git@host:org/repo`) and GitHub HTTPS remote URLs.
_REMOTE_URL = re.compile(r'(?:git@.*:|https://github.com/)(?P<org>.*)/(?P<repo>.*)$')

for r in all_repos:
    # Git's config format is looser than configparser's defaults allow:
    #  * a key may be repeated inside a section (e.g. several `fetch =` lines),
    #    which raises DuplicateOptionError unless strict=False;
    #  * a key may appear without a value, which raises ParsingError unless
    #    allow_no_value=True;
    #  * values may contain `%`, which breaks the default interpolation.
    cp = configparser.ConfigParser(strict=False, allow_no_value=True, interpolation=None)
    # A missing config file is silently skipped by `read`, leaving no sections.
    cp.read(r / '.git' / 'config')
    for sname in cp.sections():
        if not _REMOTE_SECTION.match(sname):
            continue
        # A remote section is not guaranteed to carry a usable `url`.
        remote_url = cp[sname].get('url')
        if not remote_url:
            continue
        m = _REMOTE_URL.match(remote_url)
        if not m:
            continue
        organisation = m.group('org')
        all_organisations.add(organisation)
        if organisation not in want_organisations:
            continue
        repo = m.group('repo')
        # Drop the optional `.git` suffix so that `org/repo.git` and `org/repo`
        # clones of the same repository share one key, and so the name can be
        # used directly in GitHub web URLs.
        if repo.endswith('.git'):
            repo = repo[:-len('.git')]
        key = (organisation, repo)
        if key not in repos and key not in ignore_repos:
            repos[key] = r
            # One remote is enough to identify this checkout.
            break

print("Found repos belonging to the following organisations:")
# Bare expression: displayed as the notebook cell's output.
all_organisations
In [ ]:
import subprocess
# Separator placed between the fields of each `git log` line. It is unlikely to
# appear in a commit subject, and the parsing below copes if it does.
delimiter = '\t!\t'
# One `(org, repo, date, sha, message)` tuple per commit on or after `last_blog`.
commits = []
for ((org,repo),r) in repos.items():
    # %H = full hash, %aI = author date in strict ISO 8601, %s = subject line.
    result = subprocess.run(
        ['git','log',f'--format=format:%H{delimiter}%aI{delimiter}%s'],
        cwd=r,
        capture_output=True,
        encoding='utf-8',
        errors='replace',  # one badly-encoded subject must not abort the whole run
    )
    if result.returncode != 0:
        # e.g. a repository with no commits yet; report it and move on.
        print(f"git log failed in {r}: {result.stderr.strip()}")
        continue

    for line in result.stdout.split('\n'):
        # Split at most twice so a delimiter inside the subject stays in `message`.
        fields = line.split(delimiter, 2)
        if len(fields) != 3:
            # Blank line (empty output) or something that is not a log record.
            continue
        sha, datestr, message = fields

        # datetime.fromisoformat only accepts a trailing 'Z' from Python 3.11 on;
        # normalise it in case git writes UTC that way.
        if datestr.endswith('Z'):
            datestr = datestr[:-1] + '+00:00'
        date = datetime.fromisoformat(datestr)
        # Author dates are not guaranteed to be in log order, so every line is
        # checked rather than stopping at the first old commit.
        if date < last_blog:
            continue

        commits.append( (org,repo,date,sha,message) )

# Oldest first.
commits.sort(key=lambda x:x[2])
In [ ]:
from IPython.display import HTML

import html

# Build one table row per commit. Every value taken from the repositories is
# HTML-escaped: commit subjects are free text and may contain `<`, `>` or `&`,
# which would otherwise corrupt the table or inject markup into the notebook.
rows = []
for org,repo,date,sha,message in commits:
    url = f'''https://github.com/{org}/{repo}/commit/{sha}'''
    repo_label = html.escape(f'{org}/{repo}')
    safe_url = html.escape(url, quote=True)
    short_sha = html.escape(sha[:8])
    safe_message = html.escape(message)

    rows.append(f'''<tr>
    <td>{repo_label}</td>
    <td><a href="{safe_url}" target="_blank">{short_sha}</a></td>
    <td style="text-align: left; width: 40em;font-family: monospace;">{safe_message}</td>
    <td>{date}</td>
    </tr>''')

# Join once instead of growing the string row by row.
table = ''.join(rows)

# Bare expression: rendered as the notebook cell's output.
HTML('<table>'+table+'</table>')