#!/usr/bin/env python3
import matplotlib.pyplot as plt
import xdg
import re
import pycurl
import certifi
import os
import sys
from bs4 import BeautifulSoup
from io import BytesIO
# where we get the bench list from
# sadly message edits don't appear in the public archive so we have to
# download the gitlab raw logs too
# (maybe we should be getting the messages from the zulip API instead??)
archive = '
https://coq.gitlab.io/zulip-archive/stream/240008-GitHub-notifications/topic/Bench.20Notifications.html'
print ('Getting bench list.')
buffer = BytesIO()
c = pycurl.Curl()
c.setopt(c.URL, archive)
c.setopt(c.WRITEFUNCTION, buffer.write)
c.setopt(c.CAINFO, certifi.where())
c.perform()
c.close()
str = buffer.getvalue().decode('utf8')
print ('Parsing bench list.')
# parse HTML and remove tags
# NB since coq was made default language on zulip the tables are full of <span> tags
# so we really don't want to keep them
str = BeautifulSoup(str, "html.parser").text
# NB: the line with the date starts with a space
# (in the original there is the avatar broken image)
datere = re.compile(' Bench bot \((.*)\):')
jobre = re.compile('Bench at
https://gitlab.com/coq/coq/-/jobs/(.*)$')
benches=[]
curdate=None
for line in iter(str.splitlines()):
match = re.match(datere, line)
if match:
# Next post
curdate = match[1]
else:
match = re.match(jobre, line)
if match:
job = match[1]
benches += [(curdate, match[1])]
cachedir = xdg.xdg_cache_home() / "coq-bench-plotter"
os.makedirs(cachedir, exist_ok=True)
## download job logs
print ('Downloading job logs.')
m = pycurl.CurlMulti()
m.setopt(pycurl.M_MAX_HOST_CONNECTIONS, 8)
files=[]
for _, job in benches:
fname = cachedir / (job + '.log')
if not(os.path.exists(fname)):
f = open(fname, 'xb') # x: will error if already exists
files += [f]
c = pycurl.Curl()
c.setopt(c.URL, '
https://gitlab.com/coq/coq/-/jobs/' + job + '/raw')
c.setopt(c.WRITEFUNCTION, f.write)
c.setopt(c.CAINFO, certifi.where())
c.setopt(c.FOLLOWLOCATION, True)
m.add_handle(c)
num_handles = len(files)
while num_handles:
print (f'Downloading {num_handles} log files.')
ret = m.select(5.0)
if ret == -1:
continue
while 1:
ret, num_handles = m.perform()
if ret != pycurl.E_CALL_MULTI_PERFORM:
break
for f in files:
f.flush()
f.close()
## parse job logs
print ('Parsing job logs.')
# captures package name and OLD time
# the numerals are to avoid matching the table header
packre = re.compile('│[ ]+([^ ]+) │[ ]+[^ ]+[ ]+([0-9][^ ]*)[ ]+([^ ]+)')
parsed=[]
for date, job in benches:
cur={}
fname = cachedir / (job + '.log')
with open(fname) as f:
for l in f:
match = re.match(packre, l)
if match:
# the table appears multiple times in the log so this may be overriding the value
# that is OK (it's not worth bothering to find the last table)
cur[match[1]] = (match[2], match[3])
if cur:
parsed += [(date, job, cur)]
def filter_to(packname):
dates=[]
jobs=[]
times=[]
changes=[]
for date, job, packs in parsed:
if packname in packs:
dates += [date]
jobs += [job]
time, change = packs[packname]
times += [float(time)]
changes += [float(change)]
return dates, jobs, times, changes
def plot(packname):
dates, jobs, times, changes = filter_to(packname)
# alternatively you can use dates for x axis, times for y axis
# with x = jobs, if you find an interesting point, hover the mouse on it
# pyplot will say somewhere in the window "x=..., y=..."
# now if you don't want to read and type the whole 10 digit number
# export as svg and search for the last 3 digits followed by a space
# it should be easy to find a xml comment with the job number eg "<!-- 2100592270 -->"
plt.plot(jobs, changes)
plt.ylabel(packname)
plt.show()
def list_packs():
known={}
for _, _, packs in parsed:
for pack in packs.keys():
if pack in known.keys():
known[pack] += 1
else:
known[pack] = 1
return known
plot(sys.argv[1])