diff options
Diffstat (limited to 'infra/git_mirror_bot/mirror_aosp_to_ghub_repo.py')
-rw-r--r-- | infra/git_mirror_bot/mirror_aosp_to_ghub_repo.py | 176 |
1 files changed, 138 insertions, 38 deletions
diff --git a/infra/git_mirror_bot/mirror_aosp_to_ghub_repo.py b/infra/git_mirror_bot/mirror_aosp_to_ghub_repo.py index fa1738237..53e27cea0 100644 --- a/infra/git_mirror_bot/mirror_aosp_to_ghub_repo.py +++ b/infra/git_mirror_bot/mirror_aosp_to_ghub_repo.py @@ -12,18 +12,45 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Mirrors a Gerrit repo into GitHub. -Mirrors all the branches (refs/heads/foo) from Gerrit to Github as-is, taking -care of propagating also deletions. +""" Mirrors a Gerrit repo into GitHub, turning CLs into individual branches. + +This script does a bit of git black magic. It does mainly two things: +1) Mirrors all the branches (refs/heads/foo) from Gerrit to Github as-is, taking + care of propagating also deletions. +2) Rewrites Gerrit CLs (refs/changes/NN/cl_number/patchset_number) as + Github branches (refs/heads/cl_number) recreating a linear chain of commits + for each patchset in any given CL. + +2) is the trickier part. The problem is that Gerrit stores each patchset of +each CL as an independent ref, e.g.: + $ git ls-remote origin + 94df12f950462b55a2257b89d1fad6fac24353f9 refs/changes/10/496410/1 + 4472fadddf8def74fd76a66ff373ca1245c71bcc refs/changes/10/496410/2 + 90b8535da0653d8f072e86cef9891a664f4e9ed7 refs/changes/10/496410/3 + 2149c215fa9969bb454f23ce355459f28604c545 refs/changes/10/496410/meta + + 53db7261268802648d7f6125ae6242db17e7a60d refs/changes/20/494620/1 + d25e56930486363e0637b0a9debe3ae3ec805207 refs/changes/20/494620/2 + +Where each ref is based on top of the master branch (or whatever the dev chooses). +On GitHub, instead, we want to recreate something similar to the pull-request +model, ending up with one branch per CL, and one commit per patchset. +Also we want to make them non-hidden branch heads (i.e. in the refs/heads/) +namespace, because Travis CI does not hook hidden branches. 
+In conclusion we want to transform the above into: + +refs/changes/496410 + * commit: [CL 496410, Patchset 3] (parent: [CL 496410, Patchset 2]) + * commit: [CL 496410, Patchset 2] (parent: [CL 496410, Patchset 1]) + * commit: [CL 496410, Patchset 1] (parent: [master]) +refs/changes/494620 + * commit: [CL 494620, Patchset 2] (parent: [CL 494620, Patchset 1]) + * commit: [CL 494620, Patchset 1] (parent: [master]) -This script used to be more complex, turning all the Gerrit CLs -(refs/changes/NN/cl_number/patchset_number) into Github branches -(refs/heads/cl_number). This use case was dropped as we moved away from Travis. -See the git history of this file for more. """ -import argparse +import collections import logging import os import re @@ -31,12 +58,27 @@ import shutil import subprocess import sys import time +import traceback + +from multiprocessing.pool import ThreadPool CUR_DIR = os.path.dirname(os.path.abspath(__file__)) GIT_UPSTREAM = 'https://android.googlesource.com/platform/external/perfetto/' GIT_MIRROR = 'git@github.com:catapult-project/perfetto.git' WORKDIR = os.path.join(CUR_DIR, 'repo') +# Ignores CLs that have a cumulative tree size greater than this. GitHub rightly +# refuses to accept commits that have files that are too big, suggesting to use +# LFS instead. +MAX_TREE_SIZE_MB = 50 + +# Ignores all CL numbers < this. 913796 roughly maps to end of Feb 2019. +MIN_CL_NUM = 913796 + +# Max number of concurrent git subprocesses that can be run while generating +# per-CL branches. +GIT_SUBPROCESS_CONCURRENCY = 10 + # Min delay (in seconds) between two consecutive git poll cycles. This is to # avoid hitting gerrit API quota limits. 
POLL_PERIOD_SEC = 60 @@ -48,22 +90,16 @@ ENV = {'GIT_SSH_COMMAND': 'ssh -i ' + os.path.join(CUR_DIR, 'deploy_key')} def GitCmd(*args, **kwargs): cmd = ['git'] + list(args) p = subprocess.Popen( - cmd, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=sys.stderr, - cwd=WORKDIR, - env=ENV) + cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=sys.stderr, + cwd=WORKDIR, env=ENV) out = p.communicate(kwargs.get('stdin'))[0] assert p.returncode == 0, 'FAIL: ' + ' '.join(cmd) return out # Create a git repo that mirrors both the upstream and the mirror repos. -def Setup(args): +def Setup(): if os.path.exists(WORKDIR): - if args.no_clean: - return shutil.rmtree(WORKDIR) os.makedirs(WORKDIR) GitCmd('init', '--bare', '--quiet') @@ -72,12 +108,64 @@ def Setup(args): GitCmd('remote', 'add', 'mirror', GIT_MIRROR, '--mirror=fetch') -def Sync(args): +# Returns the SUM(file.size) for file in the given git tree. +def GetTreeSize(tree_sha1): + raw = GitCmd('ls-tree', '-r', '--long', tree_sha1) + return sum(int(line.split()[3]) for line in raw.splitlines()) + + +def GetCommit(commit_sha1): + raw = GitCmd('cat-file', 'commit', commit_sha1) + return { + 'tree': re.search(r'^tree\s(\w+)$', raw, re.M).group(1), + 'parent': re.search(r'^parent\s(\w+)$', raw, re.M).group(1), + 'author': re.search(r'^author\s(.+)$', raw, re.M).group(1), + 'committer': re.search(r'^committer\s(.+)$', raw, re.M).group(1), + 'message': re.search(r'\n\n(.+)', raw, re.M | re.DOTALL).group(1), + } + + +def ForgeCommit(tree, parent, author, committer, message): + raw = 'tree %s\nparent %s\nauthor %s\ncommitter %s\n\n%s' % ( + tree, parent, author, committer, message) + out = GitCmd('hash-object', '-w', '-t', 'commit', '--stdin', stdin=raw) + return out.strip() + + +# Translates a CL, identified by a (Gerrit) CL number and a list of patchsets +# into a git branch, where all patchsets look like subsequent commits. +# This function must be stateless and idempotent, it's invoked by ThreadPool. 
+def TranslateClIntoBranch(packed_args): + cl_num, patchsets = packed_args + if cl_num < MIN_CL_NUM: + return + parent_sha1 = None + for patchset_num, commit_sha1 in sorted(patchsets.items(), key=lambda x:x[0]): + patchset_data = GetCommit(commit_sha1) + # Skip Cls that are too big as they would be rejected by GitHub. + tree_size_bytes = GetTreeSize(patchset_data['tree']) + if tree_size_bytes > MAX_TREE_SIZE_MB * (1 << 20): + logging.warning('Skipping CL %s because its too big (%d bytes)', + cl_num, tree_size_bytes) + return + parent_sha1 = parent_sha1 or patchset_data['parent'] + forged_sha1 = ForgeCommit( + tree=patchset_data['tree'], + parent=parent_sha1, + author=patchset_data['author'], + committer=patchset_data['committer'], + message='[Patchset %d] %s' % (patchset_num, patchset_data['message'])) + parent_sha1 = forged_sha1 + return 'refs/heads/changes/%d' % cl_num, forged_sha1 + + +def Sync(): logging.info('Fetching git remotes') GitCmd('fetch', '--all', '--quiet') all_refs = GitCmd('show-ref') future_heads = {} current_heads = {} + changes = collections.defaultdict(dict) # List all refs from both repos and: # 1. Keep track of all branch heads refnames and sha1s from the (github) @@ -102,6 +190,27 @@ def Sync(args): future_heads['refs/heads/' + branch] = ref_sha1 continue + PREFIX = 'refs/remotes/upstream/changes/' + if ref.startswith(PREFIX): + (_, cl_num, patchset) = ref[len(PREFIX):].split('/') + if not cl_num.isdigit() or not patchset.isdigit(): + continue + cl_num, patchset = int(cl_num), int(patchset) + changes[cl_num][patchset] = ref_sha1 + + # Now iterate over the upstream (AOSP) CLS and forge a chain of commits, + # creating one branch refs/heads/changes/cl_number for each set of patchsets. + # Forging commits is mostly fork() + exec() and I/O bound, parallelism helps + # significantly to hide those latencies. 
+ logging.info('Forging per-CL branches') + pool = ThreadPool(processes=GIT_SUBPROCESS_CONCURRENCY) + for res in pool.imap_unordered(TranslateClIntoBranch, changes.iteritems()): + if res is None: + continue + branch_ref, forged_sha1 = res + future_heads[branch_ref] = forged_sha1 + pool.close() + deleted_heads = set(current_heads) - set(future_heads) logging.info('current_heads: %d, future_heads: %d, deleted_heads: %d', len(current_heads), len(future_heads), len(deleted_heads)) @@ -116,37 +225,28 @@ def Sync(args): for ref_to_update, ref_sha1 in future_heads.iteritems(): if current_heads.get(ref_to_update) != ref_sha1: update_ref_cmd += 'update %s %s\n' % (ref_to_update, ref_sha1) + print update_ref_cmd + logging.info('Pushing updates') + # Update objects and push. GitCmd('update-ref', '--stdin', stdin=update_ref_cmd) - - if args.push: - logging.info('Pushing updates') - GitCmd('push', 'mirror', '--all', '--prune', '--force') - GitCmd('gc', '--prune=all', '--aggressive', '--quiet') - else: - logging.info('Dry-run mode, skipping git push. 
Pass --push for prod mode.') + GitCmd('push', 'mirror', '--all', '--prune', '--force') + GitCmd('gc', '--prune=all', '--aggressive', '--quiet') def Main(): - parser = argparse.ArgumentParser() - parser.add_argument('--push', default=False, action='store_true') - parser.add_argument('--no-clean', default=False, action='store_true') - parser.add_argument('-v', dest='verbose', default=False, action='store_true') - args = parser.parse_args() - - logging.basicConfig( - format='%(asctime)s %(levelname)-8s %(message)s', - level=logging.DEBUG if args.verbose else logging.INFO, - datefmt='%Y-%m-%d %H:%M:%S') - logging.info('Setting up git repo one-off') - Setup(args) + Setup() while True: logging.info('------- BEGINNING OF SYNC CYCLE -------') - Sync(args) + Sync() logging.info('------- END OF SYNC CYCLE -------') time.sleep(POLL_PERIOD_SEC) if __name__ == '__main__': + logging.basicConfig( + format='%(asctime)s %(levelname)-8s %(message)s', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S') sys.exit(Main()) |