#!/usr/bin/python

# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
|
|
This script crawls crbug. Sort-of.
|
|
Invocation:
|
|
Get all bugs with labels, strings (in summary and/or comments):
|
|
crbug_crawler.py --labels 'one two three'
|
|
--queries '"first query" "second query"'
|
|
|
|
Get baddest open bugs of all time:
|
|
crbug_crawler.py --reap
|
|
|
|
Tips:
|
|
- Label based queries will return faster than text queries.
|
|
- contrib/crbug_shell.py is a wrapper that allows you to incrementally
|
|
filter search results using this script.
|
|
"""
|
|
|
|
import argparse
import cmd
import logging
import sys
import shlex

import common
from autotest_lib.client.common_lib import global_config
from autotest_lib.server.cros.dynamic_suite import reporting


def _parse_args(args):
    if not args:
        import crbug_crawler
        logging.error('Improper usage of crbug_crawler: %s',
                      crbug_crawler.__doc__)
        sys.exit(1)

    description = ('Usage: crbug_crawler.py --reap')
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--quiet', help=('Turn off logging noise.'),
                        action='store_true', default=False)
    parser.add_argument('--num', help='Number of issues to output.', default=10,
                        type=int)
    parser.add_argument('--queries',
                        help=('Search query. Eg: --queries "%s %s"' %
                              ('build_Root', 'login')),
                        default='')
    parser.add_argument('--labels',
                        help=('Search labels. Eg: --labels "%s %s"' %
                              ('autofiled', 'Pri-1')), default=None)
    parser.add_argument('--reap', help=('Top autofiled bugs ordered by count.'),
                        action='store_true', default=False)
    return parser.parse_args(args)


class Update(object):
    """Class encapsulating fields of an update to a bug.
    """
    open_statuses = ['Unconfirmed', 'Untriaged', 'Available', 'Assigned',
                     'Started', 'ExternalDependency']
    closed_statuses = ['Fixed', 'Verified', 'Duplicate', 'WontFix', 'Archived']

    def __init__(self, comment='', labels='', status=''):
        self.comment = comment
        self.labels = labels if labels else []
        self.status = status


    def __str__(self):
        msg = 'status: %s' % self.status
        if self.labels:
            msg = '%s labels: %s' % (msg, self.labels)
        if self.comment:
            msg = '%s comment: %s' % (msg, self.comment)
        return msg


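# A minimal illustration (not executed) of how an Update renders; the values
# below are hypothetical:
#
#     print Update(status='Assigned', labels=['bogus'])
#     # -> status: Assigned labels: ['bogus']

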
class UpdateManager(object):
    """Update manager that allows you to revert status updates.

    This class keeps track of the last update applied and is capable
    of reverting it.
    """

    def __init__(self, autocommit=False):
        """Initialize update manager.

        @param autocommit: If False just print out the update instead
            of committing it.
        """
        self.history = {}
        self.present = {}
        self.reporter = reporting.Reporter()
        self.phapi_lib = self.reporter.get_bug_tracker_client()
        self.autocommit = autocommit


    def revert(self):
        """Only manages status reverts as of now.
        """
        for issue_id, update in self.history.iteritems():
            logging.warning('You will have to manually update %s and %s on %s',
                            self.present[issue_id].labels,
                            self.present[issue_id].comment, issue_id)
            # Create a new update with just the status.
            self.update(issue_id, Update(status=update.status))


    def update(self, old_issue, update):
        """Record the state of an issue before updating it.

        @param old_issue: The issue to update. If an id is specified an
            issue is constructed. If an issue object (as defined in phapi_lib
            Issue) is passed in, it is used directly.
        @param update: The Update object to apply to the issue.
        """
        if type(old_issue) == int:
            old_issue = self.phapi_lib.get_tracker_issue_by_id(old_issue)
        old_update = Update(
                labels=old_issue.labels, status=old_issue.status)

        if not update.status:
            update.status = old_update.status
        elif (update.status not in Update.open_statuses and
              update.status not in Update.closed_statuses):
            raise ValueError('Unknown status %s' % update.status)

        if not self.autocommit:
            logging.warning('Would have applied the following update: '
                            '%s -> %s', old_update, update)
            return

        self.history[old_issue.id] = old_update
        self.reporter.modify_bug_report(
                issue_id=old_issue.id, comment=update.comment,
                label_update=update.labels,
                status=update.status)
        self.present[old_issue.id] = update


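# A minimal sketch (not executed) of the intended flow, using a hypothetical
# issue id. With autocommit=False the manager only logs the change it would
# make; with autocommit=True the change is applied and revert() re-applies the
# issue's original status (labels/comments still need manual cleanup):
#
#     manager = UpdateManager(autocommit=True)
#     manager.update(12345, Update(labels=['bogus'], status='Assigned'))
#     manager.revert()

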
class Crawler(object):
    """Class capable of crawling crbug.

    This class applies filters to issues it crawls and caches them locally.
    """

    # The limit at which we ask for confirmation to proceed with the crawl.
    PROMPT_LIMIT = 2000

    def __init__(self):
        self.reporter = reporting.Reporter()
        self.phapi_client = self.reporter.get_bug_tracker_client()
        self.issues = None
        self.all_autofiled_query = 'ANCHOR TestFailure'
        self.all_autofiled_label = 'autofiled'
        self.prompted = False


    def fuzzy_search(self, query='', label='', fast=True):
        """Returns all issues using one query and/or one label.

        @param query: A string representing the query.
        @param label: A string representing the label.
        @param fast: If true, don't bother fetching comments.

        @return: A list of issues matching the query. If fast is
            specified the issues won't have comments.
        """
        if not query and not label:
            raise ValueError('Require query or labels to make a tracker query, '
                    'try query = "%s" or one of the predefined labels %s' %
                    (self.all_autofiled_query,
                     self.reporter._PREDEFINED_LABELS))
        if type(label) != str:
            raise ValueError('The crawler only supports one label per query, '
                    'and it must be a string. You supplied %s' % label)
        return self.phapi_client.get_tracker_issues_by_text(
                query, label=label, full_text=not fast)


    @staticmethod
    def _get_autofiled_count(issue):
        """Return the autofiled count.

        @param issue: An issue object that has labels.

        @return: An integer representing the autofiled count.
        """
        for label in issue.labels:
            if 'autofiled-count-' in label:
                return int(label.replace('autofiled-count-', ''))

        # Force bugs without autofiled-count to sink
        return 0


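    # For illustration (hypothetical label): an issue labeled
    # 'autofiled-count-27' yields 27 above, while issues without the label
    # yield 0 and therefore sort last in the autofiled-count ordering.

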
    def _prompt_crawl(self, new_issues, start_index):
        """Warn the user that a crawl is getting large.

        This method prompts for a y/n answer in case the user wants to abort
        the crawl and specify another set of labels/queries.

        @param new_issues: A list of issues used with the start_index to
            determine the number of issues already processed.
        @param start_index: The start index of the next crawl iteration.
        """
        logging.warning('Found %s issues, crawling issues starting from %s',
                        len(new_issues), start_index)
        if start_index > self.PROMPT_LIMIT and not self.prompted:
            logging.warning('Already crawled %s issues, it is possible that '
                    'you\'ve specified a very general label. If this is the '
                    'case consider re-ordering the labels so they start with '
                    'the rarest. Continue crawling [y/n]?',
                    start_index + len(new_issues))
            self.prompted = raw_input() == 'y'
            if not self.prompted:
                sys.exit(0)


    def exhaustive_crawl(self, query='', label='', fast=True):
        """Perform an exhaustive crawl using one label and query string.

        @param query: A string representing one query.
        @param label: A string representing one label.

        @return A list of issues sorted by descending autofiled count.
        """
        start_index = 0
        self.phapi_client.set_max_results(200)
        logging.warning('Performing an exhaustive crawl with label %s query %s',
                        label, query)
        vague_issues = []
        new_issues = self.fuzzy_search(query=query, label=label, fast=fast)
        while new_issues:
            vague_issues += new_issues
            start_index += len(new_issues) + 1
            self.phapi_client.set_start_index(start_index)
            new_issues = self.fuzzy_search(query=query, label=label,
                                           fast=fast)
            self._prompt_crawl(new_issues, start_index)

        # Subsequent calls will clear the issues cache with new results.
        self.phapi_client.set_start_index(1)
        return sorted(vague_issues, reverse=True,
                      key=lambda issue: self._get_autofiled_count(issue))


    @staticmethod
    def filter_labels(issues, labels):
        """Takes a list of labels and returns matching issues.

        @param issues: A list of issues to parse for labels.
        @param labels: A list of labels to match.

        @return: A list of matching issues. The issues must contain
            all the labels specified.
        """
        if not labels:
            return issues
        matching_issues = set([])
        labels = set(labels)
        for issue in issues:
            issue_labels = set(issue.labels)
            if issue_labels.issuperset(labels):
                matching_issues.add(issue)
        return matching_issues


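    # Illustration with hypothetical labels: an issue carrying
    # ['autofiled', 'Pri-1', 'OS-Chrome'] survives
    # filter_labels(issues, ['autofiled', 'Pri-1']) because its label set is a
    # superset of the filter, but is dropped for ['autofiled', 'Pri-2'].

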
    @classmethod
    def does_query_match(cls, issue, query):
        """Check if a query matches the given issue.

        @param issue: The issue to check.
        @param query: The query to check against.

        @return: True if the query matches, false otherwise.
        """
        if query in issue.title or query in issue.summary:
            return True
        # We can only search comments if the issue is a complete issue
        # i.e as defined in phapi_lib.Issue.
        try:
            if any(query in comment for comment in issue.comments):
                return True
        except (AttributeError, TypeError):
            pass
        return False


    @classmethod
    def filter_queries(cls, issues, queries):
        """Takes a list of queries and returns matching issues.

        @param issues: A list of issues to parse. If the issues contain
            comments and a query is not in the issue's title or summary,
            the comments are parsed for a substring match.
        @param queries: A list of queries to parse the issues for.
            This method looks for an exact substring match within each issue.

        @return: A list of matching issues.
        """
        if not queries:
            return issues
        matching_issues = set([])
        for issue in issues:
            # For each query, check if it's in the title, description or
            # comments. If a query isn't in any of these, discard the issue.
            for query in queries:
                if cls.does_query_match(issue, query):
                    matching_issues.add(issue)
                else:
                    if issue in matching_issues:
                        logging.warning('%s: %s\n \tPassed a subset of the '
                                        'queries but failed query %s',
                                        issue.id, issue.title, query)
                        matching_issues.remove(issue)
                    break
        return matching_issues


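    # Note the AND semantics above: an issue is kept only if every query
    # matches. For example (hypothetical queries), with
    # queries=['login', 'timeout'] an issue whose title contains 'login' but
    # whose comments never mention 'timeout' is first added, then logged and
    # dropped when the second query fails.

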
    def filter_issues(self, queries='', labels=None, fast=True):
        """Run the queries, labels filters by crawling crbug.

        @param queries: A space separated string of queries, usually passed
            through the command line.
        @param labels: A space separated string of labels, usually passed
            through the command line.
        @param fast: If specified, skip creating comments for issues since this
            can be a slow process. This value is only a suggestion, since it is
            ignored if multiple queries are specified.
        """
        queries = shlex.split(queries)
        labels = shlex.split(labels) if labels else None

        # We'll need comments to filter multiple queries.
        if len(queries) > 1:
            fast = False
        matching_issues = self.exhaustive_crawl(
                query=queries.pop(0) if queries else '',
                label=labels.pop(0) if labels else '', fast=fast)
        matching_issues = self.filter_labels(matching_issues, labels)
        matching_issues = self.filter_queries(matching_issues, queries)
        self.issues = list(matching_issues)


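    # How the filters above divide the work, as a sketch: shlex.split keeps
    # quoted phrases intact, e.g. shlex.split('"first query" login') returns
    # ['first query', 'login']. The first query/label is handed to the tracker
    # through exhaustive_crawl; any remaining labels and queries are applied
    # locally by filter_labels and filter_queries.

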
    def dump_issues(self, limit=None):
        """Print issues.
        """
        if limit and limit < len(self.issues):
            issues = self.issues[:limit]
        else:
            issues = self.issues
        # TODO: Modify formatting, include some paging etc.
        for issue in issues:
            try:
                print ('[%s] %s crbug.com/%s %s' %
                       (self._get_autofiled_count(issue),
                        issue.status, issue.id, issue.title))
            except UnicodeEncodeError as e:
                print 'Unicode error encoding issue id %s' % issue.id
                continue


def _update_test(issues):
    """A simple update test, to record usage.

    @param issues: A list of issues (or issue ids) to update.
    """
    updater = UpdateManager(autocommit=True)
    for issue in issues:
        updater.update(issue,
                       Update(comment='this is bogus', labels=['bogus'],
                              status='Assigned'))
    updater.revert()


def configure_logging(quiet=False):
    """Configure logging.

    @param quiet: True to turn off warning messages.
    """
    logging.basicConfig()
    logger = logging.getLogger()
    level = logging.WARNING
    if quiet:
        level = logging.ERROR
    logger.setLevel(level)


def main(args):
    crawler = Crawler()
    if args.reap:
        if args.queries or args.labels:
            logging.error('Query based ranking of bugs not supported yet.')
            return
        queries = ''
        labels = crawler.all_autofiled_label
    else:
        queries = args.queries
        labels = args.labels
    crawler.filter_issues(queries=queries, labels=labels,
                          fast=False if queries else True)
    crawler.dump_issues(int(args.num))
    logging.warning('\nThis is a truncated list of %s results, use --num %s '
                    'to get them all. If you want more informative results/'
                    'better querying capabilities try crbug_shell.py.',
                    args.num, len(crawler.issues))


if __name__ == '__main__':
    args = _parse_args(sys.argv[1:])
    configure_logging(args.quiet)
    main(args)