utils/Reviewing/find_interesting_reviews.py

#!/usr/bin/env python

import argparse
import email.mime.multipart
import email.mime.text
import logging
import os.path
import pickle
import re
import smtplib
import subprocess
import sys
from datetime import datetime, timedelta
from phabricator import Phabricator

# Setting up a virtualenv to run this script can be done by running the
# following commands:
# $ virtualenv venv
# $ . ./venv/bin/activate
# $ pip install Phabricator
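#
# Note (assumption, not part of the original script): the Phabricator()
# constructor used below is expected to pick up API credentials the way the
# phabricator package normally does, e.g. from ~/.arcrc. If your setup
# differs, credentials can be passed explicitly (illustrative values):
#   phab = Phabricator(host="https://reviews.llvm.org/api/", token="api-...")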

GIT_REPO_METADATA = (("llvm", "https://llvm.org/git/llvm.git"), )

# The below PhabXXX classes represent objects as modelled by Phabricator.
# The classes can be serialized to disk, to try to make sure that we don't
# needlessly re-fetch lots of data from Phabricator, as that would make
# this script unusably slow.


class PhabObject:
    OBJECT_KIND = None

    def __init__(self, id):
        self.id = id


class PhabObjectCache:
    def __init__(self, PhabObjectClass):
        self.PhabObjectClass = PhabObjectClass
        self.most_recent_info = None
        self.oldest_info = None
        self.id2PhabObjects = {}

    def get_name(self):
        return self.PhabObjectClass.OBJECT_KIND + "sCache"

    def get(self, id):
        if id not in self.id2PhabObjects:
            self.id2PhabObjects[id] = self.PhabObjectClass(id)
        return self.id2PhabObjects[id]

    def get_ids_in_cache(self):
        return self.id2PhabObjects.keys()

    def get_objects(self):
        return self.id2PhabObjects.values()

    DEFAULT_DIRECTORY = "PhabObjectCache"

    def _get_pickle_name(self, directory):
        file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle"
        return os.path.join(directory, file_name)

    def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY):
        """
        FIXME: consider if serializing to JSON would bring interoperability
        advantages over serializing to pickle.
        """
        try:
            f = open(self._get_pickle_name(directory), "rb")
        except IOError as err:
            print("Could not find cache. Error message: {0}. Continuing..."
                  .format(err))
        else:
            with f:
                try:
                    d = pickle.load(f)
                    self.__dict__.update(d)
                except EOFError as err:
                    print("Cache seems to be corrupt. " +
                          "Not using cache. Error message: {0}".format(err))

    def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY):
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(self._get_pickle_name(directory), "wb") as f:
            pickle.dump(self.__dict__, f)
        print("wrote cache to disk, most_recent_info={0}".format(
            datetime.fromtimestamp(self.most_recent_info)
            if self.most_recent_info is not None else None))


class PhabReview(PhabObject):
    OBJECT_KIND = "Review"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, title, dateCreated, dateModified, author):
        self.title = title
        self.dateCreated = dateCreated
        self.dateModified = dateModified
        self.author = author

    def setPhabDiffs(self, phabDiffs):
        self.phabDiffs = phabDiffs


class PhabUser(PhabObject):
    OBJECT_KIND = "User"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, phid, realName):
        self.phid = phid
        self.realName = realName


class PhabHunk:
    def __init__(self, rest_api_hunk):
        self.oldOffset = int(rest_api_hunk["oldOffset"])
        self.oldLength = int(rest_api_hunk["oldLength"])
        # self.actual_lines_changed_offset will contain the offsets of the
        # lines that were changed in this hunk.
        self.actual_lines_changed_offset = []
        offset = self.oldOffset
        inHunk = False
        hunkStart = -1
        contextLines = 3
        for line in rest_api_hunk["corpus"].split("\n"):
            if line.startswith("+"):
                # line is a new line that got introduced in this patch.
                # Do not record it as a changed line.
                if inHunk is False:
                    inHunk = True
                    hunkStart = max(self.oldOffset, offset - contextLines)
                continue
            if line.startswith("-"):
                # line was changed or removed from the older version of the
                # code. Record it as a changed line.
                if inHunk is False:
                    inHunk = True
                    hunkStart = max(self.oldOffset, offset - contextLines)
                offset += 1
                continue
            # line is a context line.
            if inHunk is True:
                inHunk = False
                hunkEnd = offset + contextLines
                self.actual_lines_changed_offset.append((hunkStart, hunkEnd))
            offset += 1
        if inHunk is True:
            hunkEnd = offset + contextLines
            self.actual_lines_changed_offset.append((hunkStart, hunkEnd))

        # The above algorithm could result in adjacent or overlapping ranges
        # being recorded into self.actual_lines_changed_offset.
        # Merge the adjacent and overlapping ranges in there:
        t = []
        lastRange = None
        for start, end in self.actual_lines_changed_offset + \
                [(sys.maxsize, sys.maxsize)]:
            if lastRange is None:
                lastRange = (start, end)
            else:
                if lastRange[1] >= start:
                    lastRange = (lastRange[0], end)
                else:
                    t.append(lastRange)
                    lastRange = (start, end)
        self.actual_lines_changed_offset = t

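# Worked example for PhabHunk (hypothetical corpus, not from Phabricator):
# for a hunk with oldOffset=20 and corpus " a\n-b\n c\n", the loop above
# sees one removed line at old line 21 and records the range
# (max(20, 21 - contextLines), 22 + contextLines) == (20, 25), i.e. the
# changed line widened by up to 3 context lines on each side and clamped at
# the hunk start; adjacent or overlapping ranges would then be merged.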

class PhabChange:
    def __init__(self, rest_api_change):
        self.oldPath = rest_api_change["oldPath"]
        self.hunks = [PhabHunk(h) for h in rest_api_change["hunks"]]


class PhabDiff(PhabObject):
    OBJECT_KIND = "Diff"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, rest_api_results):
        self.revisionID = rest_api_results["revisionID"]
        self.dateModified = int(rest_api_results["dateModified"])
        self.dateCreated = int(rest_api_results["dateCreated"])
        self.changes = [PhabChange(c) for c in rest_api_results["changes"]]


class ReviewsCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabReview)


class UsersCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabUser)


reviews_cache = ReviewsCache()
users_cache = UsersCache()
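
# A minimal usage sketch of the caches above (hypothetical review id):
#
#   cache = ReviewsCache()
#   cache.populate_cache_from_disk()  # prints a message if no pickle exists
#   review = cache.get(12345)         # creates an empty PhabReview(12345)
#                                     # on a cache miss
#   cache.write_cache_to_disk()       # persists the cache's __dict__ to
#                                     # PhabObjectCache/PhabReviews.pickle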


def init_phab_connection():
    phab = Phabricator()
    phab.update_interfaces()
    return phab


def update_cached_info(phab, cache, phab_query, order, record_results,
                       max_nr_entries_per_fetch, max_nr_days_to_cache):
    q = phab
    LIMIT = max_nr_entries_per_fetch
    for query_step in phab_query:
        q = getattr(q, query_step)
    results = q(order=order, limit=LIMIT)
    most_recent_info, oldest_info = record_results(cache, results, phab)
    oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - \
        timedelta(days=max_nr_days_to_cache)
    most_recent_info_overall = most_recent_info
    cache.write_cache_to_disk()
    after = results["cursor"]["after"]
    print("after: {0!r}".format(after))
    print("most_recent_info: {0}".format(
        datetime.fromtimestamp(most_recent_info)))
    while (after is not None
           and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch):
        need_more_older_data = \
            (cache.oldest_info is None or
             datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch)
        print(("need_more_older_data={0} cache.oldest_info={1} " +
               "oldest_info_to_fetch={2}").format(
                   need_more_older_data,
                   datetime.fromtimestamp(cache.oldest_info)
                   if cache.oldest_info is not None else None,
                   oldest_info_to_fetch))
        need_more_newer_data = \
            (cache.most_recent_info is None or
             cache.most_recent_info < most_recent_info)
        print(("need_more_newer_data={0} cache.most_recent_info={1} " +
               "most_recent_info={2}")
              .format(need_more_newer_data, cache.most_recent_info,
                      most_recent_info))
        if not need_more_older_data and not need_more_newer_data:
            break
        results = q(order=order, after=after, limit=LIMIT)
        most_recent_info, oldest_info = record_results(cache, results, phab)
        after = results["cursor"]["after"]
        print("after: {0!r}".format(after))
        print("most_recent_info: {0}".format(
            datetime.fromtimestamp(most_recent_info)))
        cache.write_cache_to_disk()
    cache.most_recent_info = most_recent_info_overall
    if after is None:
        # We did fetch all records. Mark the cache to contain all info since
        # the start of time.
        oldest_info = 0
    cache.oldest_info = oldest_info
    cache.write_cache_to_disk()

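
# For reference, the record_* callbacks below consume Conduit search results
# shaped roughly like this (illustrative excerpt, not captured output):
#
#   {"data": [{"id": 12345, "type": "DREV", "phid": "PHID-DREV-...",
#              "fields": {"title": ..., "authorPHID": ...,
#                         "dateCreated": ..., "dateModified": ...}}, ...],
#    "cursor": {"after": "..."}}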

def record_reviews(cache, reviews, phab):
    most_recent_info = None
    oldest_info = None
    for reviewInfo in reviews["data"]:
        if reviewInfo["type"] != "DREV":
            continue
        id = reviewInfo["id"]
        # phid = reviewInfo["phid"]
        dateModified = int(reviewInfo["fields"]["dateModified"])
        dateCreated = int(reviewInfo["fields"]["dateCreated"])
        title = reviewInfo["fields"]["title"]
        author = reviewInfo["fields"]["authorPHID"]
        phabReview = cache.get(id)
        if "dateModified" not in phabReview.__dict__ or \
           dateModified > phabReview.dateModified:
            diff_results = phab.differential.querydiffs(revisionIDs=[id])
            diff_ids = sorted(diff_results.keys())
            phabDiffs = []
            for diff_id in diff_ids:
                diffInfo = diff_results[diff_id]
                d = PhabDiff(diff_id)
                d.update(diffInfo)
                phabDiffs.append(d)
            phabReview.update(title, dateCreated, dateModified, author)
            phabReview.setPhabDiffs(phabDiffs)
            print("Updated D{0} modified on {1} ({2} diffs)".format(
                id, datetime.fromtimestamp(dateModified), len(phabDiffs)))

        if most_recent_info is None:
            most_recent_info = dateModified
        elif most_recent_info < dateModified:
            most_recent_info = dateModified

        if oldest_info is None:
            oldest_info = dateModified
        elif oldest_info > dateModified:
            oldest_info = dateModified
    return most_recent_info, oldest_info


def record_users(cache, users, phab):
    most_recent_info = None
    oldest_info = None
    for info in users["data"]:
        if info["type"] != "USER":
            continue
        id = info["id"]
        phid = info["phid"]
        dateModified = int(info["fields"]["dateModified"])
        # dateCreated = int(info["fields"]["dateCreated"])
        realName = info["fields"]["realName"]
        phabUser = cache.get(id)
        phabUser.update(phid, realName)
        if most_recent_info is None:
            most_recent_info = dateModified
        elif most_recent_info < dateModified:
            most_recent_info = dateModified
        if oldest_info is None:
            oldest_info = dateModified
        elif oldest_info > dateModified:
            oldest_info = dateModified
    return most_recent_info, oldest_info


PHABCACHESINFO = ((reviews_cache, ("differential", "revision", "search"),
                   "updated", record_reviews, 5, 7),
                  (users_cache, ("user", "search"), "newest", record_users,
                   100, 1000))
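# Each PHABCACHESINFO entry holds, in order: the cache object, the Conduit
# method path (("differential", "revision", "search") becomes
# phab.differential.revision.search), the result ordering, the record_*
# callback, the page size per fetch, and how many days of history to cache.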


def load_cache():
    for cache, phab_query, order, record_results, _, _ in PHABCACHESINFO:
        cache.populate_cache_from_disk()
        print("Loaded {0}: {1} entries".format(
            cache.get_name(), len(cache.get_ids_in_cache())))
        print("Loaded {0} has most recent info: {1}".format(
            cache.get_name(),
            datetime.fromtimestamp(cache.most_recent_info)
            if cache.most_recent_info is not None else None))


def update_cache(phab):
    load_cache()
    for cache, phab_query, order, record_results, max_nr_entries_per_fetch, \
            max_nr_days_to_cache in PHABCACHESINFO:
        update_cached_info(phab, cache, phab_query, order, record_results,
                           max_nr_entries_per_fetch, max_nr_days_to_cache)
        ids_in_cache = cache.get_ids_in_cache()
        print("{0} objects in {1}".format(len(ids_in_cache), cache.get_name()))
        cache.write_cache_to_disk()


def get_most_recent_reviews(days):
    newest_reviews = sorted(
        reviews_cache.get_objects(), key=lambda r: -r.dateModified)
    if len(newest_reviews) == 0:
        return newest_reviews
    most_recent_review_time = \
        datetime.fromtimestamp(newest_reviews[0].dateModified)
    cut_off_date = most_recent_review_time - timedelta(days=days)
    result = []
    for review in newest_reviews:
        if datetime.fromtimestamp(review.dateModified) < cut_off_date:
            return result
        result.append(review)
    return result


# All of the above code is about fetching data from Phabricator and caching
# it on local disk. The code below contains the actual "business logic" for
# this script.

_userphid2realname = None


def get_real_name_from_author(user_phid):
    global _userphid2realname
    if _userphid2realname is None:
        _userphid2realname = {}
        for user in users_cache.get_objects():
            _userphid2realname[user.phid] = user.realName
    return _userphid2realname.get(user_phid, "unknown")


def print_most_recent_reviews(phab, days, filter_reviewers):
    msgs = []

    def add_msg(msg):
        msgs.append(msg)
        print(msg)

    newest_reviews = get_most_recent_reviews(days)
    add_msg("These are the reviews that look interesting for you to " +
            "review. The report below has 2 sections. The first section is " +
            "organized per review; the second section is organized per " +
            "potential reviewer.\n")
    oldest_review = newest_reviews[-1] if len(newest_reviews) > 0 else None
    oldest_datetime = \
        datetime.fromtimestamp(oldest_review.dateModified) \
        if oldest_review else None
    add_msg(("The report below is based on analyzing the reviews that got " +
             "touched in the past {0} days (since {1}). " +
             "The script found {2} such reviews.\n").format(
                 days, oldest_datetime, len(newest_reviews)))
    reviewer2reviews_and_scores = {}
    for i, review in enumerate(newest_reviews):
        matched_reviewers = find_reviewers_for_review(review)
        matched_reviewers = filter_reviewers(matched_reviewers)
        if len(matched_reviewers) == 0:
            continue
        add_msg(("{0:>3}. https://reviews.llvm.org/D{1} by {2}\n     {3}\n" +
                 "     Last updated on {4}").format(
                     i, review.id,
                     get_real_name_from_author(review.author), review.title,
                     datetime.fromtimestamp(review.dateModified)))
        for reviewer, scores in matched_reviewers:
            add_msg("    potential reviewer {0}, score {1}".format(
                reviewer,
                "(" + "/".join(["{0:.1f}%".format(s) for s in scores]) + ")"))
            if reviewer not in reviewer2reviews_and_scores:
                reviewer2reviews_and_scores[reviewer] = []
            reviewer2reviews_and_scores[reviewer].append((review, scores))

    # Print out a summary per reviewer.
    for reviewer in sorted(reviewer2reviews_and_scores.keys()):
        reviews_and_scores = reviewer2reviews_and_scores[reviewer]
        reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True)
        add_msg("\n\nSUMMARY FOR {0} (found {1} reviews):".format(
            reviewer, len(reviews_and_scores)))
        for review, scores in reviews_and_scores:
            add_msg("[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format(
                "/".join(["{0:.1f}%".format(s) for s in scores]), review.id,
                review.title, get_real_name_from_author(review.author)))
    return "\n".join(msgs)


def get_git_cmd_output(cmd):
    output = None
    try:
        logging.debug(cmd)
        output = subprocess.check_output(
            cmd, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logging.debug(str(e))
    if output is None:
        return None
    return output.decode("utf-8", errors='ignore')


reAuthorMail = re.compile("^author-mail <([^>]*)>.*$")
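# "git blame --line-porcelain" emits one header block per blamed line; the
# regex above picks out the header lines of the form (address illustrative):
#   author-mail <jdoe@example.com>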


def parse_blame_output_line_porcelain(blame_output):
    email2nr_occurrences = {}
    if blame_output is None:
        return email2nr_occurrences
    for line in blame_output.split('\n'):
        m = reAuthorMail.match(line)
        if m:
            author_email_address = m.group(1)
            if author_email_address not in email2nr_occurrences:
                email2nr_occurrences[author_email_address] = 1
            else:
                email2nr_occurrences[author_email_address] += 1
    return email2nr_occurrences


def find_reviewers_for_diff_heuristic(diff):
    # Heuristic 1: assume good reviewers are the ones that touched the same
    # lines before as this patch is touching.
    # Heuristic 2: assume good reviewers are the ones that touched the same
    # files before as this patch is touching.
    reviewers2nr_lines_touched = {}
    reviewers2nr_files_touched = {}
    # Assume the last revision before the diff was modified is the revision
    # the diff applies to.
    git_repo = "git_repos/llvm"
    cmd = 'git -C {0} rev-list -n 1 --before="{1}" master'.format(
        git_repo,
        datetime.fromtimestamp(
            diff.dateModified).strftime("%Y-%m-%d %H:%M:%S"))
    git_output = get_git_cmd_output(cmd)
    if git_output is None:
        # The git command failed; no heuristic scores can be computed.
        return []
    base_revision = git_output.strip()
    logging.debug("Base revision={0}".format(base_revision))
    for change in diff.changes:
        path = change.oldPath
        # Compute heuristic 1: look at context of patch lines.
        for hunk in change.hunks:
            for start_line, end_line in hunk.actual_lines_changed_offset:
                # Collect git blame results for authors in those ranges.
                cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e " +
                       "-w --line-porcelain -L {1},{2} {3} -- {4}").format(
                           git_repo, start_line, end_line, base_revision, path)
                blame_output = get_git_cmd_output(cmd)
                for reviewer, nr_occurrences in \
                        parse_blame_output_line_porcelain(blame_output).items():
                    if reviewer not in reviewers2nr_lines_touched:
                        reviewers2nr_lines_touched[reviewer] = 0
                    reviewers2nr_lines_touched[reviewer] += nr_occurrences
        # Compute heuristic 2: don't look at context, just at files touched.
        # Collect git blame results for authors of the whole file.
        cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " +
               "--line-porcelain {1} -- {2}").format(git_repo, base_revision,
                                                     path)
        blame_output = get_git_cmd_output(cmd)
        for reviewer, nr_occurrences in parse_blame_output_line_porcelain(
                blame_output).items():
            if reviewer not in reviewers2nr_files_touched:
                reviewers2nr_files_touched[reviewer] = 0
            reviewers2nr_files_touched[reviewer] += 1

    # Compute "match scores"
    total_nr_lines = sum(reviewers2nr_lines_touched.values())
    total_nr_files = len(diff.changes)
    reviewers_matchscores = \
        [(reviewer,
          (reviewers2nr_lines_touched.get(reviewer, 0)*100.0/total_nr_lines
           if total_nr_lines != 0 else 0,
           reviewers2nr_files_touched[reviewer]*100.0/total_nr_files
           if total_nr_files != 0 else 0))
         for reviewer in reviewers2nr_files_touched]
    reviewers_matchscores.sort(key=lambda i: i[1], reverse=True)
    return reviewers_matchscores
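# The return value is a list of (reviewer_email, (line_score, file_score))
# tuples, both scores in percent, sorted best match first, e.g. (made-up
# values): [("jdoe@example.com", (42.0, 100.0)), ...]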


def find_reviewers_for_review(review):
    # Process the newest diff first.
    diffs = sorted(
        review.phabDiffs, key=lambda d: d.dateModified, reverse=True)
    if len(diffs) == 0:
        # Return an empty list rather than None, so callers can filter the
        # result without special-casing.
        return []
    diff = diffs[0]
    matched_reviewers = find_reviewers_for_diff_heuristic(diff)
    # Show progress, as this is a slow operation:
    sys.stdout.write('.')
    sys.stdout.flush()
    logging.debug("matched_reviewers: {0}".format(matched_reviewers))
    return matched_reviewers


def update_git_repos():
    git_repos_directory = "git_repos"
    for name, url in GIT_REPO_METADATA:
        dirname = os.path.join(git_repos_directory, name)
        if not os.path.exists(dirname):
            cmd = "git clone {0} {1}".format(url, dirname)
            output = get_git_cmd_output(cmd)
        cmd = "git -C {0} pull --rebase".format(dirname)
        output = get_git_cmd_output(cmd)


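# Note: smtplib.SMTP() with a bare connect() assumes a mail relay listening
# on localhost:25; adjust here if your environment uses a different relay.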
def send_emails(email_addresses, msg):
    s = smtplib.SMTP()
    s.connect()
    for email_address in email_addresses:
        email_msg = email.mime.multipart.MIMEMultipart()
        email_msg['From'] = ''
        email_msg['To'] = email_address
        email_msg['Subject'] = 'LLVM patches you may be able to review.'
        email_msg.attach(email.mime.text.MIMEText(msg, 'plain'))
        # python 3.x: s.send_message(email_msg)
        s.sendmail(email_msg['From'], email_msg['To'], email_msg.as_string())
    s.quit()


def filter_reviewers_to_report_for(people_to_look_for):
    # The below is just an example filter, to only report the potential
    # reviews for the people that will receive the report email.
    return lambda potential_reviewers: [r for r in potential_reviewers
                                        if r[0] in people_to_look_for]


def main():
    parser = argparse.ArgumentParser(
        description='Match open reviews to potential reviewers.')
    parser.add_argument(
        '--no-update-cache',
        dest='update_cache',
        action='store_false',
        default=True,
        help='Do not update cached Phabricator objects')
    parser.add_argument(
        'email_addresses',
        nargs='*',
        help="The email addresses (as known by LLVM git) of " +
        "the people to look for reviews for.")
    parser.add_argument('--verbose', '-v', action='count', default=0)

    args = parser.parse_args()

    if args.verbose >= 1:
        logging.basicConfig(level=logging.DEBUG)

    # argparse already yields text strings, so no explicit decode is needed.
    people_to_look_for = list(args.email_addresses)

    phab = init_phab_connection()

    if args.update_cache:
        update_cache(phab)

    load_cache()
    update_git_repos()
    msg = print_most_recent_reviews(
        phab,
        days=1,
        filter_reviewers=filter_reviewers_to_report_for(people_to_look_for))
    send_emails(people_to_look_for, msg)


if __name__ == "__main__":
    main()