#!/usr/bin/env python3
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Use this script to retrieve information from https://crashreport.libreoffice.org
# about a specific version of LibreOffice
# Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/
import argparse
import html
import os
import re
import sys
import urllib.parse
from datetime import datetime

import git
import requests
from bs4 import BeautifulSoup
# Column titles of the generated report table, in output order.
# The first column is deliberately untitled.
tableHeader = [
    "", "Name", "Ratio", "Count",
    "First report", "Last Report",
    "OS", "Stack", "Reason",
    "Last 4 UNO Commands",
]
# Static prologue of the generated report: document head, table styling and
# the sorttable script. Every "%VERSION%" placeholder is meant to be replaced
# with the queried version before the text is written out.
# The text is assembled from adjacent literals so that no line break inside
# the markup can leave the string unterminated; the result is a single line.
HtmlHeader = (
    # DOCTYPE keeps browsers in standards mode.
    '<!DOCTYPE html> '
    '<html lang="en"> '
    '<head> '
    '<title>%VERSION% crashes</title> '
    '<style> '
    'table { '
    'width: 100%; '
    'border-collapse: collapse; '
    '} '
    'a { text-decoration: none; color: inherit; } '
    'th, #td1 { '
    'padding: 8px 12px; '
    'border: 1px solid #ccc; '
    'text-align: left; '
    '} '
    'th { '
    'cursor: pointer; '
    'background-color: #f2f2f2; '
    '} '
    'th:hover { '
    'background-color: #ddd; '
    '} '
    '</style> '
    '</head> '
    '<body> '
    '<h2>%VERSION% crashes</h2> '
    '<script src="https://www.kryogenix.org/code/browser/sorttable/sorttable.js"></script> '
)
def convert_str_to_date(value):
    """Convert a textual report timestamp into a ``datetime`` (time dropped).

    All dots are removed, the month spellings listed below are shortened to
    the three-letter form that ``%b`` parses, and the last ", "-separated
    field (the time of day) is discarded before parsing.

    Args:
        value: timestamp text whose date part reads like "Mar 5, 2021" once
            normalised, followed by ", <time>".

    Returns:
        datetime: the parsed date at 00:00.

    Raises:
        ValueError: if the remaining text does not match ``'%b %d, %Y'``.
    """
    # Spellings that are longer than the abbreviation %b expects.
    month_abbreviations = (
        ('March', 'Mar'),
        ('April', 'Apr'),
        ('June', 'Jun'),
        ('July', 'Jul'),
        ('Sept', 'Sep'),
    )

    value = value.replace('.', '')
    for spelled, abbreviated in month_abbreviations:
        value = value.replace(spelled, abbreviated)

    # reset the time leaving the date
    date_only = ", ".join(value.split(", ")[:-1])
    return datetime.strptime(date_only, '%b %d, %Y')
def parse_version_url(url):
    """Download a per-version statistics page and index its crash signatures.

    Args:
        url: address of the statistics page to fetch.

    Returns:
        dict: maps each crash name (link text of the first cell) to
        ``[count, first_report_date, last_report_date]``, read from the
        second, sixth and seventh cells of each row of the table with id
        "data-table".

    The process exits with status 1 if the request times out.
    """
    # Only the network call can time out, so keep the try body minimal.
    try:
        html_text = requests.get(url, timeout=200).text
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        sys.exit(1)

    soup = BeautifulSoup(html_text, 'html.parser')
    table = soup.find("table", {"id": "data-table"}).tbody

    crashReports = {}
    for tr in table.find_all("tr"):
        td_list = tr.find_all("td")
        crashName = td_list[0].a.text.strip()
        crashNumber = int(td_list[1].text.strip())
        firstCrashDate = convert_str_to_date(td_list[5].text.strip())
        lastCrashDate = convert_str_to_date(td_list[6].text.strip())
        crashReports[crashName] = [crashNumber, firstCrashDate, lastCrashDate]

    return crashReports
def parse_reports_and_get_most_recent_report_from_last_page(url):
    """Summarise a crash-signature page and pick one representative report.

    Args:
        url: address of the signature page to fetch.

    Returns:
        tuple: ``(count, ID, OS)`` where ``count`` is the sum of the second
        column of the "os_tab" table, and ``ID``/``OS`` identify the report
        with the highest version among rows whose OS cell is "windows".
        When no such row exists, the ID and OS of the last listed report are
        used instead. ``(0, "", "")`` is returned when the "os_tab" table is
        missing.

    Raises:
        requests.exceptions.Timeout: re-raised after printing a message.
    """
    try:
        html_text = requests.get(url, timeout=200).text
    except requests.exceptions.Timeout:
        print("Timeout")
        raise
    soup = BeautifulSoup(html_text, 'html.parser')

    count = 0
    ID, OS = "", ""

    try:
        os_tab = soup.find("table", {"id": "os_tab"}).tbody
    except AttributeError:
        print("os_tab not found")
        return count, ID, OS

    for tr in os_tab.find_all("tr"):
        td_list = tr.find_all("td")
        count += int(td_list[1].text.strip())

    reports = soup.find("div", {"id": "reports"}).tbody

    # Versions are compared as tuples of their numeric components. Gluing the
    # digits into one integer would rank 7.1.10.1 (71101) above 7.2.0.4 (7204).
    version = ()
    currentID, currentOS = "", ""
    for tr in reports.find_all("tr"):
        td_list = tr.find_all("td")

        currentID = td_list[0].a.text.strip()
        currentVersion = tuple(
            int(part) for part in re.findall(r"\d+", td_list[2].text))
        currentOS = td_list[3].text.strip()

        # get most recent version
        # symbols on linux are not very informative generally
        if currentOS == "windows" and currentVersion > version:
            version = currentVersion
            ID = currentID
            OS = currentOS

    # No windows report found: fall back to the last row that was read.
    if not ID:
        ID = currentID

    if not OS:
        OS = currentOS

    return count, ID, OS
def parse_details_and_get_info(url, gitRepo, gitBranch):
    """Fetch one crash-details page and condense it for the report.

    Args:
        url: address of the crash-details page to fetch.
        gitRepo: path of a local source checkout; the source file of each
            stack frame is opened relative to it.
        gitBranch: branch name inserted into the git.libreoffice.org links.

    Returns:
        tuple: ``(reason, stackTable, unoCommands)``. ``reason`` is the text
        of the ninth row of the details table. ``stackTable`` is an HTML
        ``<table>`` with one row per stack frame whose source file was found
        in ``gitRepo``; each row links the numbered source line to the online
        repository. ``unoCommands`` is the value of the
        "Last-4-Uno-Commands" metadata row, or "" when that row is absent.

    Raises:
        requests.exceptions.Timeout: re-raised after printing a message.
    """
    try:
        html_text = requests.get(url, timeout=200).text
    except requests.exceptions.Timeout:
        print("Timeout")
        raise
    soup = BeautifulSoup(html_text, 'html.parser')

    details = soup.find("div", {"id": "details-tab-panel"}).tbody
    tr_list = details.find_all("tr")
    reason = tr_list[8].td.text.strip()

    stackTable = "<table>"

    count = 0
    frames = soup.find("div", {"id": "frames"}).tbody
    for tr in frames.find_all("tr"):
        td_list = tr.find_all("td")
        source = td_list[3].text.strip()
        if not source or count > 10:
            continue

        # Normalise the build machine's Windows path to a repo-relative one.
        source = source.replace("\\", "/").replace(
            "C:/cygwin64/home/buildslave/source/libo-core/", "")

        # Expect "file:line"; frames that do not fit are skipped instead of
        # aborting the whole run.
        parts = source.split(":")
        if len(parts) < 2:
            continue
        codeFile, codeNumber = parts[0], parts[1]
        try:
            lineNumber = int(codeNumber)
        except ValueError:
            continue

        codeLine = "<tr><td>"
        try:
            # errors="replace" keeps stray non-UTF-8 bytes in a source file
            # from raising UnicodeDecodeError.
            with open(os.path.join(gitRepo, codeFile),
                      encoding="utf-8", errors="replace") as f:
                for index, line in enumerate(f, start=1):
                    if index != lineNumber:
                        continue
                    urlLink = "https://git.libreoffice.org/core/+/" + \
                        gitBranch + "/" + codeFile + "#" + str(codeNumber)
                    # Source code routinely contains <, > and & (templates,
                    # references), so it must be escaped before being
                    # embedded in the table.
                    codeLine += str(count) + ": <a target=\"_blank\" href=\"" \
                        + html.escape(urlLink) + "\">" \
                        + html.escape(line.strip()) + "</a>"
                    count += 1
                    break
        except FileNotFoundError:
            continue
        codeLine += "</td></tr>"
        stackTable += codeLine

    stackTable += "</table>"

    metadata = soup.find("div", {"id": "metadata-tab-panel"}).tbody
    unoCommands = ""
    for tr in metadata.find_all("tr"):
        if tr.th.text.strip() == "Last-4-Uno-Commands":
            unoCommands = tr.td.text.strip()

    return reason, stackTable, unoCommands
# Script entry point: collect the crash signatures reported for --version and
# write them as a table to "crashes_<version>.html" in the current directory.
# NOTE(review): this section appears to have lost its indentation, and several
# string literals look like remnants of HTML tags (e.g. "td1\">", "sortable\">",
# empty f.write("") calls) - TODO restore from the upstream script.
# NOTE(review): the last statement of this section is cut off mid-literal, so
# the end of the script is not visible here.
if __name__ ==
'__main__':
# Both options are required: the version to query and the path of a local git
# checkout (used for the active branch name and to look up source lines).
parser = argparse.ArgumentParser()
parser.add_argument(
'--version', action=
'store', dest=
"version", required=
True)
parser.add_argument(
'--repository', action=
"store", dest=
"repository", required=
True)
args = parser.parse_args()
# Name of the branch currently checked out in the given repository.
gitBranch = git.Repo(args.repository).active_branch.name
# Maps crash name -> [count, first report date, last report date].
crashes = parse_version_url(
"https://crashreport.libreoffice.org/stats/version/" + args.version +
"?limit=1000&days=30"pan>)
print(str(len(crashes)) + " crash reports in version " + args.version)
# NOTE(review): nothing visible here ever adds to this list, so the
# "k not in crashesInFile" check below is always true.
crashesInFile = []
# Dots in the version become underscores in the output file name.
fileName = "crashes_" + args.version.replace(".", "_") + ".html"
print("Using " + fileName)
with open(fileName, "w") as f:
# Page prologue with the version substituted, then one header cell per entry
# of tableHeader.
f.write(HtmlHeader.replace("%VERSION%", args.version))
f.write("sortable\">")
f.write("")
f.write("")
for name in tableHeader:
f.write("" + name + " | ")
f.write("
")
f.write("")
f.flush()
f.write("")
# Running row number, incremented only for signatures that are written out.
count = 0
for k, lDate in crashes.items():
if k not in crashesInFile:
print("Parsing " + k)
# NOTE(review): this write happens before the try block, but both "continue"
# paths below (crashCount == 0 and Timeout) skip the write that follows the
# try block - presumably leaving an opened row unclosed; verify.
f.write("")
try:
crashCount, crashID, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
"https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k))
if crashCount == 0:
continue
crashReason, codeStack, unoCommands = parse_details_and_get_info(
"https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository, gitBranch)
# Crashes per day over the span between first and last report (inclusive),
# rounded to two decimals.
ratio = round(crashCount / ((lDate[2] - lDate[1]).days + 1), 2)
count += 1
f.write("td1\">" + str(count) + " | ")
f.write("td1\">_blank\" href=\"https://crashreport.libreoffice.org/stats/crash_details/"
+ crashID + "\">" + k + "</a></b></td>")
f.write(" | td1\">" + str(ratio) + " | ")
f.write("td1\">" + str(crashCount) + " | ")
f.write("td1\">" + lDate[1].strftime('%Y/%m/%d') + " | ")
f.write("td1\">" + lDate[2].strftime('%Y/%m/%d') + " | ")
f.write("td1\">" + crashOS + " | ")
f.write("td1\">" + codeStack + " | ")
f.write("td1\">" + crashReason + " | ")
f.write("td1\">" + unoCommands + " | ")
# A timed-out request skips this signature instead of aborting the run.
except (requests.exceptions.Timeout):
continue
f.write("
")
# Flush after each signature so partial results reach the file.
f.flush()
f.write("")
f.write("
")
f.write("