Skip to content

Commit

Permalink
Git clone shallow to improve new importer instance performance (#765)
Browse files Browse the repository at this point in the history
Use a shallow clone that only fetches commits back to the previously synced
date. This avoids cloning the entire repository history when it is not needed.
A subsequent pull/fetch will only fetch newer commits, without needing to fill
in the entire history.

A potential issue: if the default branch is changed to a completely different
branch (not just renamed), more history may be needed than what is available
since the last sync time. This is very rare, and I am not aware of any reason
for doing this.

Fixes #576
  • Loading branch information
another-rex committed Oct 5, 2022
1 parent 6ce5ca7 commit 536de61
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 16 deletions.
2 changes: 2 additions & 0 deletions docker/importer/importer.py
Expand Up @@ -134,6 +134,7 @@ def checkout(self, source_repo):
return osv.ensure_updated_checkout(
source_repo.repo_url,
os.path.join(self._sources_dir, source_repo.name),
last_update_date=source_repo.last_update_date,
git_callbacks=self._git_callbacks(source_repo),
branch=source_repo.repo_branch)

Expand Down Expand Up @@ -312,6 +313,7 @@ def _process_updates_git(self, source_repo: osv.SourceRepository):
source_repo, original_sha256, deleted_entry, deleted=True)

source_repo.last_synced_hash = str(repo.head.target)
source_repo.last_update_date = utcnow().date()
source_repo.put()

logging.info("Finish processing git: %s", source_repo.name)
Expand Down
78 changes: 62 additions & 16 deletions osv/repos.py
Expand Up @@ -12,12 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Repo functions."""

import datetime
import logging
import os
import shutil
import subprocess
import time
from typing import Optional

import pygit2

Expand Down Expand Up @@ -81,8 +82,20 @@ def _checkout_branch(repo, branch):
repo.reset(remote_branch.target, pygit2.GIT_RESET_HARD)


def clone(git_url, checkout_dir, git_callbacks=None):
"""Perform a clone."""
def clone(git_url: str,
checkout_dir: str,
git_callbacks: Optional[datetime.datetime] = None,
last_update_date: Optional[datetime.datetime] = None):
"""
Perform a clone.
:param git_url: git URL
:param checkout_dir: checkout directory
:param git_callbacks: Used for git to retrieve credentials when pulling.
See `GitRemoteCallback`
:param last_update_date: Optional python datetime object used to specify
the date of the shallow clone.
"""
# Use 'git' CLI here as it's much faster than libgit2's clone.
env = {}
if git_callbacks:
Expand All @@ -91,19 +104,36 @@ def clone(git_url, checkout_dir, git_callbacks=None):
f'-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
f'-o User={git_callbacks.username} -o IdentitiesOnly=yes')

subprocess.check_call(
['git', 'clone', _git_mirror(git_url), checkout_dir],
env=env,
stderr=subprocess.STDOUT)
call_args = ['git', 'clone', _git_mirror(git_url), checkout_dir]
if last_update_date:
# Clone from 1 day prior to be safe and avoid any off by 1 errors
shallow_since_date = (last_update_date -
datetime.timedelta(days=1)).strftime('%Y-%m-%d')
call_args.extend(['--shallow-since=' + shallow_since_date])

subprocess.check_call(call_args, env=env, stderr=subprocess.STDOUT)
return pygit2.Repository(checkout_dir)


def clone_with_retries(git_url, checkout_dir, git_callbacks=None, branch=None):
"""Clone with retries."""
def clone_with_retries(git_url: str,
checkout_dir: str,
last_update_date: Optional[datetime.datetime] = None,
git_callbacks: Optional[datetime.datetime] = None,
branch: Optional[str] = None):
"""Clone with retries.
Number of retries is defined in the CLONE_TRIES constant.
:param git_url: git URL
:param checkout_dir: checkout directory
:param git_callbacks: Used for git to retrieve credentials when pulling.
See `GitRemoteCallback`
:param last_update_date: Optional python datetime object used to specify
the date of the shallow clone.
"""
logging.info('Cloning %s to %s', git_url, checkout_dir)
for _ in range(CLONE_TRIES):
try:
repo = clone(git_url, checkout_dir, git_callbacks)
repo = clone(git_url, checkout_dir, git_callbacks, last_update_date)
repo.cache = {}
if branch:
_checkout_branch(repo, branch)
Expand Down Expand Up @@ -135,11 +165,22 @@ def _use_existing_checkout(git_url,
return repo


def ensure_updated_checkout(git_url,
checkout_dir,
git_callbacks=None,
branch=None):
"""Ensure updated checkout."""
def ensure_updated_checkout(
git_url: str,
checkout_dir: str,
last_update_date: Optional[datetime.datetime] = None,
git_callbacks: Optional[pygit2.RemoteCallbacks] = None,
branch: Optional[str] = None):
"""Ensure updated checkout.
:param git_url: git URL
:param checkout_dir: checkout directory
:param git_callbacks: Used for git to retrieve credentials when pulling.
See `GitRemoteCallback`
:param last_update_date: Optional python datetime object used to specify
the date of the shallow clone. If the repository already exists, this
argument will be ignored, and new commits pulled down.
"""
if os.path.exists(checkout_dir):
# Already exists, reset and checkout latest revision.
try:
Expand All @@ -151,7 +192,12 @@ def ensure_updated_checkout(git_url,
shutil.rmtree(checkout_dir)

repo = clone_with_retries(
git_url, checkout_dir, git_callbacks=git_callbacks, branch=branch)
git_url,
checkout_dir,
last_update_date,
git_callbacks=git_callbacks,
branch=branch)

logging.info('Repo now at: %s', repo.head.peel().message)
return repo

Expand Down

0 comments on commit 536de61

Please sign in to comment.