Diffstat (limited to 'infra/git_mirror_bot/mirror_aosp_to_ghub_repo.py')
-rw-r--r--  infra/git_mirror_bot/mirror_aosp_to_ghub_repo.py | 176
1 file changed, 138 insertions(+), 38 deletions(-)
diff --git a/infra/git_mirror_bot/mirror_aosp_to_ghub_repo.py b/infra/git_mirror_bot/mirror_aosp_to_ghub_repo.py
index fa1738237..53e27cea0 100644
--- a/infra/git_mirror_bot/mirror_aosp_to_ghub_repo.py
+++ b/infra/git_mirror_bot/mirror_aosp_to_ghub_repo.py
@@ -12,18 +12,45 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-""" Mirrors a Gerrit repo into GitHub.
-Mirrors all the branches (refs/heads/foo) from Gerrit to Github as-is, taking
-care of propagating also deletions.
+""" Mirrors a Gerrit repo into GitHub, turning CLs into individual branches.
+
+This script does a bit of git black magic. It mainly does two things:
+1) Mirrors all the branches (refs/heads/foo) from Gerrit to GitHub as-is, also
+   taking care of propagating deletions.
+2) Rewrites Gerrit CLs (refs/changes/NN/cl_number/patchset_number) as
+   GitHub branches (refs/heads/changes/cl_number), recreating a linear chain
+   of commits for each patchset in any given CL.
+
+2) is the trickier part. The problem is that Gerrit stores each patchset of
+each CL as an independent ref, e.g.:
+ $ git ls-remote origin
+ 94df12f950462b55a2257b89d1fad6fac24353f9 refs/changes/10/496410/1
+ 4472fadddf8def74fd76a66ff373ca1245c71bcc refs/changes/10/496410/2
+ 90b8535da0653d8f072e86cef9891a664f4e9ed7 refs/changes/10/496410/3
+ 2149c215fa9969bb454f23ce355459f28604c545 refs/changes/10/496410/meta
+
+ 53db7261268802648d7f6125ae6242db17e7a60d refs/changes/20/494620/1
+ d25e56930486363e0637b0a9debe3ae3ec805207 refs/changes/20/494620/2
+
+Where each ref is based on top of the master branch (or whatever the dev chose).
+On GitHub, instead, we want to recreate something similar to the pull-request
+model, ending up with one branch per CL, and one commit per patchset.
+We also want to make them non-hidden branch heads (i.e. in the refs/heads/
+namespace), because Travis CI does not hook hidden branches.
+In conclusion, we want to transform the above into:
+
+refs/heads/changes/496410
+ * commit: [CL 496410, Patchset 3] (parent: [CL 496410, Patchset 2])
+ * commit: [CL 496410, Patchset 2] (parent: [CL 496410, Patchset 1])
+ * commit: [CL 496410, Patchset 1] (parent: [master])
+refs/heads/changes/494620
+ * commit: [CL 494620, Patchset 2] (parent: [CL 494620, Patchset 1])
+ * commit: [CL 494620, Patchset 1] (parent: [master])
-This script used to be more complex, turning all the Gerrit CLs
-(refs/changes/NN/cl_number/patchset_number) into Github branches
-(refs/heads/cl_number). This use case was dropped as we moved away from Travis.
-See the git history of this file for more.
"""
-import argparse
+import collections
import logging
import os
import re
@@ -31,12 +58,27 @@ import shutil
import subprocess
import sys
import time
+import traceback
+
+from multiprocessing.pool import ThreadPool
CUR_DIR = os.path.dirname(os.path.abspath(__file__))
GIT_UPSTREAM = 'https://android.googlesource.com/platform/external/perfetto/'
GIT_MIRROR = 'git@github.com:catapult-project/perfetto.git'
WORKDIR = os.path.join(CUR_DIR, 'repo')
+# Ignores CLs that have a cumulative tree size greater than this. GitHub rightly
+# refuses to accept commits containing files that are too big, suggesting the
+# use of LFS instead.
+MAX_TREE_SIZE_MB = 50
+
+# Ignores all CL numbers < this. 913796 roughly maps to end of Feb 2019.
+MIN_CL_NUM = 913796
+
+# Max number of concurrent git subprocesses that can be run while generating
+# per-CL branches.
+GIT_SUBPROCESS_CONCURRENCY = 10
+
# Min delay (in seconds) between two consecutive git poll cycles. This is to
# avoid hitting gerrit API quota limits.
POLL_PERIOD_SEC = 60
@@ -48,22 +90,16 @@ ENV = {'GIT_SSH_COMMAND': 'ssh -i ' + os.path.join(CUR_DIR, 'deploy_key')}
def GitCmd(*args, **kwargs):
cmd = ['git'] + list(args)
p = subprocess.Popen(
- cmd,
- stdin=subprocess.PIPE,
- stdout=subprocess.PIPE,
- stderr=sys.stderr,
- cwd=WORKDIR,
- env=ENV)
+ cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=sys.stderr,
+ cwd=WORKDIR, env=ENV)
out = p.communicate(kwargs.get('stdin'))[0]
assert p.returncode == 0, 'FAIL: ' + ' '.join(cmd)
return out
# Create a git repo that mirrors both the upstream and the mirror repos.
-def Setup(args):
+def Setup():
if os.path.exists(WORKDIR):
- if args.no_clean:
- return
shutil.rmtree(WORKDIR)
os.makedirs(WORKDIR)
GitCmd('init', '--bare', '--quiet')
@@ -72,12 +108,64 @@ def Setup(args):
GitCmd('remote', 'add', 'mirror', GIT_MIRROR, '--mirror=fetch')
-def Sync(args):
+# Returns the SUM(file.size) for file in the given git tree.
+def GetTreeSize(tree_sha1):
+ raw = GitCmd('ls-tree', '-r', '--long', tree_sha1)
+ return sum(int(line.split()[3]) for line in raw.splitlines())
+
+
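+# Parses a raw git commit object (as printed by `git cat-file commit`) into its
+# tree, (first) parent, author, committer and message fields.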
+def GetCommit(commit_sha1):
+ raw = GitCmd('cat-file', 'commit', commit_sha1)
+ return {
+ 'tree': re.search(r'^tree\s(\w+)$', raw, re.M).group(1),
+ 'parent': re.search(r'^parent\s(\w+)$', raw, re.M).group(1),
+ 'author': re.search(r'^author\s(.+)$', raw, re.M).group(1),
+ 'committer': re.search(r'^committer\s(.+)$', raw, re.M).group(1),
+ 'message': re.search(r'\n\n(.+)', raw, re.M | re.DOTALL).group(1),
+ }
+
+
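+# Writes a brand new commit object with the given fields directly into the git
+# object database (via `git hash-object -w -t commit`) and returns its sha1.
+# No ref is updated here; the caller decides which branch will point at it.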
+def ForgeCommit(tree, parent, author, committer, message):
+ raw = 'tree %s\nparent %s\nauthor %s\ncommitter %s\n\n%s' % (
+ tree, parent, author, committer, message)
+ out = GitCmd('hash-object', '-w', '-t', 'commit', '--stdin', stdin=raw)
+ return out.strip()
+
+
+# Translates a CL, identified by a (Gerrit) CL number and a dict of patchsets,
+# into a git branch where all the patchsets look like consecutive commits.
+# This function must be stateless and idempotent, as it's invoked by a ThreadPool.
+def TranslateClIntoBranch(packed_args):
+ cl_num, patchsets = packed_args
+ if cl_num < MIN_CL_NUM:
+ return
+ parent_sha1 = None
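+ # Replay the patchsets in ascending order. The first forged commit keeps the
+ # CL's original parent (its base commit); every following one is parented to
+ # the previously forged commit, producing a linear chain.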
+ for patchset_num, commit_sha1 in sorted(patchsets.items(), key=lambda x:x[0]):
+ patchset_data = GetCommit(commit_sha1)
+ # Skip CLs that are too big as they would be rejected by GitHub.
+ tree_size_bytes = GetTreeSize(patchset_data['tree'])
+ if tree_size_bytes > MAX_TREE_SIZE_MB * (1 << 20):
+ logging.warning('Skipping CL %s because it is too big (%d bytes)',
+ cl_num, tree_size_bytes)
+ return
+ parent_sha1 = parent_sha1 or patchset_data['parent']
+ forged_sha1 = ForgeCommit(
+ tree=patchset_data['tree'],
+ parent=parent_sha1,
+ author=patchset_data['author'],
+ committer=patchset_data['committer'],
+ message='[Patchset %d] %s' % (patchset_num, patchset_data['message']))
+ parent_sha1 = forged_sha1
+ return 'refs/heads/changes/%d' % cl_num, forged_sha1
+
+
+def Sync():
logging.info('Fetching git remotes')
GitCmd('fetch', '--all', '--quiet')
all_refs = GitCmd('show-ref')
future_heads = {}
current_heads = {}
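+ # Maps CL number -> {patchset number: patchset commit sha1}, populated below
+ # from the upstream change refs.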
+ changes = collections.defaultdict(dict)
# List all refs from both repos and:
# 1. Keep track of all branch heads refnames and sha1s from the (github)
@@ -102,6 +190,27 @@ def Sync(args):
future_heads['refs/heads/' + branch] = ref_sha1
continue
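+ # Gerrit CL patchsets (refs/changes/NN/cl_number/patchset_number on the
+ # upstream) are fetched under refs/remotes/upstream/changes/. Collect them
+ # keyed by CL number and patchset number for TranslateClIntoBranch below.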
+ PREFIX = 'refs/remotes/upstream/changes/'
+ if ref.startswith(PREFIX):
+ (_, cl_num, patchset) = ref[len(PREFIX):].split('/')
+ if not cl_num.isdigit() or not patchset.isdigit():
+ continue
+ cl_num, patchset = int(cl_num), int(patchset)
+ changes[cl_num][patchset] = ref_sha1
+
+ # Now iterate over the upstream (AOSP) CLs and forge a chain of commits,
+ # creating one branch refs/heads/changes/cl_number for each set of patchsets.
+ # Forging commits is mostly fork() + exec() and I/O bound, parallelism helps
+ # significantly to hide those latencies.
+ logging.info('Forging per-CL branches')
+ pool = ThreadPool(processes=GIT_SUBPROCESS_CONCURRENCY)
+ for res in pool.imap_unordered(TranslateClIntoBranch, changes.iteritems()):
+ if res is None:
+ continue
+ branch_ref, forged_sha1 = res
+ future_heads[branch_ref] = forged_sha1
+ pool.close()
+
deleted_heads = set(current_heads) - set(future_heads)
logging.info('current_heads: %d, future_heads: %d, deleted_heads: %d',
len(current_heads), len(future_heads), len(deleted_heads))
@@ -116,37 +225,28 @@ def Sync(args):
for ref_to_update, ref_sha1 in future_heads.iteritems():
if current_heads.get(ref_to_update) != ref_sha1:
update_ref_cmd += 'update %s %s\n' % (ref_to_update, ref_sha1)
+ print update_ref_cmd
+ logging.info('Pushing updates')
+ # Update the local refs, then push them to the GitHub mirror.
GitCmd('update-ref', '--stdin', stdin=update_ref_cmd)
-
- if args.push:
- logging.info('Pushing updates')
- GitCmd('push', 'mirror', '--all', '--prune', '--force')
- GitCmd('gc', '--prune=all', '--aggressive', '--quiet')
- else:
- logging.info('Dry-run mode, skipping git push. Pass --push for prod mode.')
+ GitCmd('push', 'mirror', '--all', '--prune', '--force')
+ GitCmd('gc', '--prune=all', '--aggressive', '--quiet')
def Main():
- parser = argparse.ArgumentParser()
- parser.add_argument('--push', default=False, action='store_true')
- parser.add_argument('--no-clean', default=False, action='store_true')
- parser.add_argument('-v', dest='verbose', default=False, action='store_true')
- args = parser.parse_args()
-
- logging.basicConfig(
- format='%(asctime)s %(levelname)-8s %(message)s',
- level=logging.DEBUG if args.verbose else logging.INFO,
- datefmt='%Y-%m-%d %H:%M:%S')
-
logging.info('Setting up git repo one-off')
- Setup(args)
+ Setup()
while True:
logging.info('------- BEGINNING OF SYNC CYCLE -------')
- Sync(args)
+ Sync()
logging.info('------- END OF SYNC CYCLE -------')
time.sleep(POLL_PERIOD_SEC)
if __name__ == '__main__':
+ logging.basicConfig(
+ format='%(asctime)s %(levelname)-8s %(message)s',
+ level=logging.INFO,
+ datefmt='%Y-%m-%d %H:%M:%S')
sys.exit(Main())