From 955584152fe87c6be88f0f3d48b6a27810030912 Mon Sep 17 00:00:00 2001 From: Tom Prince Date: Mon, 14 Feb 2022 10:39:36 -0700 Subject: [PATCH 1/4] Use released version of nix. --- .github/workflows/update.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/update.yml b/.github/workflows/update.yml index b5bcf6108b4..06c64f6c118 100644 --- a/.github/workflows/update.yml +++ b/.github/workflows/update.yml @@ -20,11 +20,10 @@ jobs: fetch-depth: 1 - name: Install/Setup - NIX - uses: cachix/install-nix-action@v13 + uses: cachix/install-nix-action@v16 with: nix_path: nixpkgs=channel:nixos-unstable - install_url: https://nixos-nix-install-tests.cachix.org/serve/i41jvy44n7vlgwlvyvii49zzjrl6x9z1/install - install_options: '--tarball-url-prefix https://nixos-nix-install-tests.cachix.org/serve' + install_url: https://releases.nixos.org/nix/nix-2.6.0/install # GC 30GB when free space < 3GB extra_nix_config: | experimental-features = nix-command flakes From 1c1d4fdf7b9ed88d8785f2b19977165b8930e640 Mon Sep 17 00:00:00 2001 From: Tom Prince Date: Mon, 14 Feb 2022 10:47:50 -0700 Subject: [PATCH 2/4] Remove unsupported versions of python. In particular, the current code of the mach-nix pep517-metadata branch doesn't support python2.7. --- flake.lock | 16 ---------------- flake.nix | 22 +++++++--------------- 2 files changed, 7 insertions(+), 31 deletions(-) diff --git a/flake.lock b/flake.lock index 9a2ea0668ab..9c2d8b632a5 100644 --- a/flake.lock +++ b/flake.lock @@ -50,21 +50,6 @@ "type": "github" } }, - "nixpkgsPy36": { - "locked": { - "lastModified": 1601475821, - "narHash": "sha256-7AI8j/xq5slauMGwC3Dp2K9TKDyDtBXBebeyWsE9euE=", - "owner": "NixOS", - "repo": "nixpkgs", - "rev": "b4db68ff563895eea6aab4ff24fa04ef403dfe14", - "type": "github" - }, - "original": { - "id": "nixpkgs", - "rev": "b4db68ff563895eea6aab4ff24fa04ef403dfe14", - "type": "indirect" - } - }, "nixpkgs_2": { "locked": { "lastModified": 1618619705, @@ -116,7 +101,6 @@ "inputs": { "mach-nix": "mach-nix", "nixpkgs": "nixpkgs_2", - "nixpkgsPy36": "nixpkgsPy36", "pypiIndex": "pypiIndex" } } diff --git a/flake.nix b/flake.nix index b32a304f416..c4d707b5f95 100644 --- a/flake.nix +++ b/flake.nix @@ -2,7 +2,6 @@ inputs = { mach-nix.url = "mach-nix"; nixpkgs.url = "nixpkgs/nixos-unstable"; - nixpkgsPy36.url = "nixpkgs/b4db68ff563895eea6aab4ff24fa04ef403dfe14"; pypiIndex.url = "github:davhau/nix-pypi-fetcher"; pypiIndex.flake = false; }; @@ -13,7 +12,7 @@ let systems = ["x86_64-linux"]; self = { - lib.supportedPythonVersions = [ "27" "36" "37" "38" "39" "310" ]; + lib.supportedPythonVersions = [ "37" "38" "39" ]; lib.formatVersion = toInt (readFile ./FORMAT_VERSION); } // foldl' (a: b: recursiveUpdate a b) {} ( map ( system: @@ -32,20 +31,13 @@ pkgs.git pkgs.nixFlakes ]; - # py27 and p36 crash when taken from current nixpkgs - # this overlay mixes python interpreters from old and new nixpkgs - py36Overlay = pkgs.writeText "py36-overlay.nix" '' + pyOverlay = pkgs.writeText "py36-overlay.nix" '' [(curr: prev: - let - pkgsNew = import ${inp.nixpkgs} {}; - in rec { + rec { useInterpreters = [ - prev.python27 - prev.python36 - pkgsNew.python37 - pkgsNew.python38 - pkgsNew.python39 - pkgsNew.python310 + prev.python37 + prev.python38 + prev.python39 ]; } )] @@ -60,7 +52,7 @@ EXTRACTOR_SRC = "${inp.mach-nix}/lib/extractor"; }; fixedVars = { - NIX_PATH = "nixpkgs=${inp.nixpkgsPy36}:nixpkgs-overlays=${py36Overlay}"; + NIX_PATH = "nixpkgs=${inp.nixpkgs}:nixpkgs-overlays=${pyOverlay}"; }; # defaultVars are only set 
if they are not already set # fixedVars are always set From 1422ff8a028dd320ed01392ae844820533019709 Mon Sep 17 00:00:00 2001 From: Tom Prince Date: Mon, 14 Feb 2022 10:55:24 -0700 Subject: [PATCH 3/4] Set error handling in bash scripts and reorder update steps. --- flake.nix | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/flake.nix b/flake.nix index c4d707b5f95..2a41c93a0e6 100644 --- a/flake.nix +++ b/flake.nix @@ -76,6 +76,7 @@ update-wheel.type = "app"; update-wheel.program = toString (pkgs.writeScript "update-wheel" '' #!/usr/bin/env bash + set -xeo pipefail ${exports} ${pyEnv}/bin/python ${./updater}/crawl_wheel_deps.py ''); @@ -84,6 +85,7 @@ update-sdist.type = "app"; update-sdist.program = toString (pkgs.writeScript "update-sdist" '' #!/usr/bin/env bash + set -xeo pipefail ${exports} ${pyEnv}/bin/python ${./updater}/crawl_sdist_deps.py ''); @@ -92,8 +94,7 @@ job-sdist-wheel.type = "app"; job-sdist-wheel.program = toString (pkgs.writeScript "job-sdist" '' #!/usr/bin/env bash - set -e - set -x + set -xeo pipefail # update the index to get the newest packages indexRevPrev=$(${pkgs.nixFlakes}/bin/nix flake metadata --json | ${pkgs.jq}/bin/jq -e --raw-output '.locks .nodes .pypiIndex .locked .rev') @@ -104,18 +105,18 @@ exit 0 fi + indexHash=$(${pkgs.nixFlakes}/bin/nix flake metadata --json | ${pkgs.jq}/bin/jq -e --raw-output '.locks .nodes .pypiIndex .locked .narHash') + echo $indexRev > PYPI_FETCHER_COMMIT + echo $indexHash > PYPI_FETCHER_SHA256 + # crawl wheel and sdist packages # If CI system has a run time limit, make sure to set MAX_MINUTES_WHEEL and MAX_MINUTES_SDIST # time ratio for wheel/sdist should be around 1/10 - MAX_MINUTES=''${MAX_MINUTES_WHEEL:-0} ${pkgs.nixFlakes}/bin/nix run .#update-wheel - MAX_MINUTES=''${MAX_MINUTES_SDIST:-0} ${pkgs.nixFlakes}/bin/nix run .#update-sdist + MAX_MINUTES=''${MAX_MINUTES_WHEEL:-0} ${update-wheel.program} + MAX_MINUTES=''${MAX_MINUTES_SDIST:-0} ${update-sdist.program} # commit to git echo $(date +%s) > UNIX_TIMESTAMP - indexHash=$(${pkgs.nixFlakes}/bin/nix flake metadata --json | ${pkgs.jq}/bin/jq -e --raw-output '.locks .nodes .pypiIndex .locked .narHash') - echo $indexRev > PYPI_FETCHER_COMMIT - echo $indexHash > PYPI_FETCHER_SHA256 - git add sdist sdist-errors wheel flake.lock UNIX_TIMESTAMP PYPI_FETCHER_COMMIT PYPI_FETCHER_SHA256 git pull origin $(git rev-parse --abbrev-ref HEAD) git commit -m "$(date) - update sdist + wheel" @@ -133,4 +134,4 @@ }) systems); in self; -} \ No newline at end of file +} From 426c5cdbb8deb9b85a0781a3275944a6c225d8a0 Mon Sep 17 00:00:00 2001 From: Tom Prince Date: Mon, 14 Feb 2022 10:57:09 -0700 Subject: [PATCH 4/4] Use pep517-metadata branch of mach-nix for generating metadata. 
--- flake.lock | 54 ++---- flake.nix | 4 +- updater/crawl_sdist_deps.py | 365 +++++++++++++++++++++++------------- 3 files changed, 250 insertions(+), 173 deletions(-) diff --git a/flake.lock b/flake.lock index 9c2d8b632a5..84a709ac877 100644 --- a/flake.lock +++ b/flake.lock @@ -18,39 +18,27 @@ "mach-nix": { "inputs": { "flake-utils": "flake-utils", - "nixpkgs": "nixpkgs", - "pypi-deps-db": "pypi-deps-db" + "nixpkgs": [ + "nixpkgs" + ], + "pypi-deps-db": [] }, "locked": { - "lastModified": 1618982703, - "narHash": "sha256-9dSHtu9XfK04caPx6kXvmhcnC3t+8bS65yqS6U4nQ/g=", - "owner": "DavHau", + "lastModified": 1644005294, + "narHash": "sha256-y1Ifu7I3s7CKNGlg6fTJogj87ZKdhMlLpjzdlOYwonc=", + "owner": "PrivateStorageio", "repo": "mach-nix", - "rev": "9543999c8abb56524347a6236e7b1c5e4aaa2c1d", + "rev": "cdda6ae52cb8f32c13ef993f8ab6de1c888f9ff3", "type": "github" }, "original": { - "id": "mach-nix", - "type": "indirect" - } - }, - "nixpkgs": { - "locked": { - "lastModified": 1615259932, - "narHash": "sha256-IXecmbqCr+XCtFwzBO3tHEd8PoJ4X4EyPZebKbV2ioE=", - "owner": "NixOS", - "repo": "nixpkgs", - "rev": "29b0d4d0b600f8f5dd0b86e3362a33d4181938f9", - "type": "github" - }, - "original": { - "owner": "NixOS", - "ref": "nixos-unstable", - "repo": "nixpkgs", + "owner": "PrivateStorageio", + "ref": "pep517-metadata", + "repo": "mach-nix", "type": "github" } }, - "nixpkgs_2": { + "nixpkgs": { "locked": { "lastModified": 1618619705, "narHash": "sha256-+yBGazqJxjT+BR00oCNamOgiEFPHBOPkqak7MUYcpBA=", @@ -65,22 +53,6 @@ "type": "indirect" } }, - "pypi-deps-db": { - "flake": false, - "locked": { - "lastModified": 1615363940, - "narHash": "sha256-GJ3ONLWAr5ejqR5bKVfKpI/n+ClaxjyYewMP+QJyq5M=", - "owner": "DavHau", - "repo": "pypi-deps-db", - "rev": "ab522f2d3255789f1ef97fa7c83d4342be156e67", - "type": "github" - }, - "original": { - "owner": "DavHau", - "repo": "pypi-deps-db", - "type": "github" - } - }, "pypiIndex": { "flake": false, "locked": { @@ -100,7 +72,7 @@ "root": { "inputs": { "mach-nix": "mach-nix", - "nixpkgs": "nixpkgs_2", + "nixpkgs": "nixpkgs", "pypiIndex": "pypiIndex" } } diff --git a/flake.nix b/flake.nix index 2a41c93a0e6..a29c4ada9c1 100644 --- a/flake.nix +++ b/flake.nix @@ -1,6 +1,8 @@ { inputs = { - mach-nix.url = "mach-nix"; + mach-nix.url = "github:PrivateStorageio/mach-nix/pep517-metadata"; + mach-nix.inputs.nixpkgs.follows = "nixpkgs"; + mach-nix.inputs.pypi-deps-db.follows = ""; nixpkgs.url = "nixpkgs/nixos-unstable"; pypiIndex.url = "github:davhau/nix-pypi-fetcher"; pypiIndex.flake = false; diff --git a/updater/crawl_sdist_deps.py b/updater/crawl_sdist_deps.py index d9e40a045b0..b672ae036ae 100644 --- a/updater/crawl_sdist_deps.py +++ b/updater/crawl_sdist_deps.py @@ -2,6 +2,7 @@ import multiprocessing import os import re +import shlex import shutil import subprocess as sp import traceback @@ -9,7 +10,7 @@ from random import shuffle from tempfile import TemporaryDirectory from time import time -from typing import Union, List, ContextManager +from typing import ContextManager, List, Union import utils from bucket_dict import LazyBucketDict @@ -51,28 +52,38 @@ class PKG: def compute_drvs(jobs: List[PackageJob], extractor_src, store=None): - extractor_jobs = list(dict( - pkg=job.name, - version=job.version, - url=job.url, - sha256=job.sha256, - pyVersions=job.py_versions, - ) for job in jobs) + extractor_jobs = list( + dict( + pkg=job.name, + version=job.version, + url=job.url, + sha256=job.sha256, + pyVersions=job.py_versions, + ) + for job in jobs + ) with TemporaryDirectory() as 
tempdir: jobs_file = f"{tempdir}/jobs.json" - with open(jobs_file, 'w') as f: + with open(jobs_file, "w") as f: json.dump(extractor_jobs, f) - os.environ['EXTRACTOR_JOBS_JSON_FILE'] = jobs_file - cmd = ["nix", "eval", "--impure", "-f", f"{extractor_src}/make-drvs.nix",] + os.environ["EXTRACTOR_JOBS_JSON_FILE"] = jobs_file + cmd = [ + "nix", + "eval", + "-L", + "--impure", + "--json", + "--expr", + f"import {extractor_src}/make-drvs.nix {{ pypiData = ./.; }}", + ] if store: cmd += ["--store", store] - print(' '.join(cmd).replace(' "', ' \'"').replace('" ', '"\' ')) + print(" ".join(cmd).replace(' "', " '\"").replace('" ', "\"' ")) try: - nix_eval_result = sp.run(cmd, capture_output=True, check=True) + nix_eval_result = sp.run(cmd, check=True, stdout=sp.PIPE) except sp.CalledProcessError as e: - print(e.stderr) raise - result = json.loads(json.loads(nix_eval_result.stdout)) + result = json.loads(nix_eval_result.stdout) for job in jobs: job.drv = result[f"{job.name}#{job.version}"] @@ -102,9 +113,7 @@ def format_error(log: str, pkg_version): log = re.sub(pkg_version, "#PKG_VER#", log) # detect some common errors and shorten them - common = ( - 'unpacker produced multiple directories', - ) + common = ("unpacker produced multiple directories",) for err in common: if err in log: log = err @@ -119,52 +128,85 @@ def format_error(log: str, pkg_version): lines = log.splitlines(keepends=True) lines = map(lambda line: line[:400], lines) remove_lines_marker = ( - '/homeless-shelter/.cache/pip/http', - '/homeless-shelter/.cache/pip', - 'DEPRECATION: Python 2.7' + "/homeless-shelter/.cache/pip/http", + "/homeless-shelter/.cache/pip", + "DEPRECATION: Python 2.7", ) - filtered = filter(lambda l: not any(marker in l for marker in remove_lines_marker), lines) - return ''.join(list(filtered)[:90]) + filtered = filter( + lambda l: not any(marker in l for marker in remove_lines_marker), lines + ) + return "".join(list(filtered)[:90]) -def extract_requirements(job: PackageJob, deadline, total_num, store=None): +def extract_requirements(job: PackageJob, deadline, total_num, store, extractor_src): try: if deadline and time() > deadline: raise Exception("Deadline occurred. 
Skipping this job") - print(f"Bucket {job.bucket} - Job {job.idx+1}/{total_num} - " - f"{job.name}:{job.version} (py: {' '.join(job.py_versions)})") + print( + f"Bucket {job.bucket} - Job {job.idx+1}/{total_num} - " + f"{job.name}:{job.version} (py: {' '.join(job.py_versions)})" + ) with TemporaryDirectory() as tempdir: out_dir = f"{tempdir}/json" - cmd = ["nix-build", job.drv, "-o", out_dir] + cmd = [ + "nix", + "build", + "-L", + "-f", + f"{extractor_src}/fast-extractor.nix", + "--arg", + "pypiData", + "./.", + "--argstr", + "argsJSON", + json.dumps( + dict( + pkg=job.name, + version=job.version, + url=job.url, + sha256=job.sha256, + pyVersions=job.py_versions, + ) + ), + "-o", + out_dir, + ] if store: cmd += ["--store", store] - # print(' '.join(cmd).replace(' "', ' \'"').replace('" ', '"\' ')) + print(" ".join(map(shlex.quote, cmd))) try: sp.run(cmd, capture_output=True, timeout=job.timeout, check=True) except (sp.CalledProcessError, sp.TimeoutExpired) as e: print(f"problem with {job.name}:{job.version}\n{e.stderr.decode()}") formatted = format_error(e.stderr.decode(), job.version) # in case GC didn't kick in early enough, we need to ignore the results - if any(s in formatted for s in ( - "o space left on device", - "lack of free disk space")): + if any( + s in formatted + for s in ("o space left on device", "lack of free disk space") + ): return e - return [JobResult( - name=job.name, - version=job.version, - py_ver=f"{py_ver}", - error=formatted, - ) for py_ver in job.py_versions] + return [ + JobResult( + name=job.name, + version=job.version, + py_ver=f"{py_ver}", + error=formatted, + ) + for py_ver in job.py_versions + ] results = [] + path = os.readlink(f"{out_dir}") + if store: + path = path.replace("/nix/store", f"{store}/nix/store") + with open(f"{path}/build-system.json") as f: + content = f.read().strip() + build_system = json.loads(content) for py_ver in job.py_versions: data = None try: - path = os.readlink(f"{out_dir}") - if store: - path = path.replace('/nix/store', f"{store}/nix/store") with open(f"{path}/python{py_ver}.json") as f: content = f.read().strip() - if content != '': + if content != "": data = json.loads(content) except FileNotFoundError: pass @@ -172,29 +214,38 @@ def extract_requirements(job: PackageJob, deadline, total_num, store=None): with open(f"{path}/python{py_ver}.log") as f: error = format_error(f.read(), job.version) print(error) - results.append(JobResult( - name=job.name, - version=job.version, - py_ver=f"{py_ver}", - error=error, - )) + results.append( + JobResult( + name=job.name, + version=job.version, + py_ver=f"{py_ver}", + error=error, + ) + ) else: - for k in ('name', 'version'): + for k in ("name", "version"): if k in data: del data[k] - results.append(JobResult( - name=job.name, - version=job.version, - py_ver=py_ver, - **data - )) + results.append( + JobResult( + name=job.name, + version=job.version, + py_ver=py_ver, + install_requires=data["requires_dist"], + setup_requires=build_system["requires"] + + data["build-requires"], + python_requires=data["requires_python"], + ) + ) return results except Exception as e: traceback.print_exc() return e -def get_jobs(pypi_index, error_dict, pkgs_dict, bucket, py_vers, limit_num=None, limit_names=None): +def get_jobs( + pypi_index, error_dict, pkgs_dict, bucket, py_vers, limit_num=None, limit_names=None +): jobs: List[PackageJob] = [] names = list(pypi_index.by_bucket(bucket).keys()) total_nr = 0 @@ -202,7 +253,7 @@ def get_jobs(pypi_index, error_dict, pkgs_dict, bucket, py_vers, limit_num=None, 
if limit_names and pkg_name not in limit_names: continue for ver, release_types in pypi_index[pkg_name].items(): - if 'sdist' not in release_types: + if "sdist" not in release_types: continue total_nr += 1 # collect python versions for which no data exists yet @@ -218,16 +269,18 @@ def get_jobs(pypi_index, error_dict, pkgs_dict, bucket, py_vers, limit_num=None, required_py_vers.append(pyver) if not required_py_vers: continue - release = release_types['sdist'] - jobs.append(PackageJob( - bucket, - pkg_name, - ver, - f"https://files.pythonhosted.org/packages/source/{pkg_name[0]}/{pkg_name}/{release[1]}", - release[0], - 0, - py_versions=required_py_vers, - )) + release = release_types["sdist"] + jobs.append( + PackageJob( + bucket, + pkg_name, + ver, + f"https://files.pythonhosted.org/packages/source/{pkg_name[0]}/{pkg_name}/{release[1]}", + release[0], + 0, + py_versions=required_py_vers, + ) + ) # because some packages are significantly bigger than others, we shuffle all jobs # to prevent fluctuations in CPU usage shuffle(jobs) @@ -245,28 +298,43 @@ def get_jobs(pypi_index, error_dict, pkgs_dict, bucket, py_vers, limit_num=None, for i, job in enumerate(jobs): job.idx = i - print(f"Bucket {bucket}: {len(jobs)} out of {total_nr} total sdist releases need to be updated") + print( + f"Bucket {bucket}: {len(jobs)} out of {total_nr} total sdist releases need to be updated" + ) return jobs def get_processed(): - with open('/tmp/jobs', 'r') as f: + with open("/tmp/jobs", "r") as f: return {tuple(t) for t in json.load(f)} def build_base(extractor_src, py_vers, store=None): - name = 'requests' - version = '2.22.0' - url = 'https://files.pythonhosted.org/packages/01/62/' \ - 'ddcf76d1d19885e8579acb1b1df26a852b03472c0e46d2b959a714c90608/requests-2.22.0.tar.gz' - sha256 = '11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4' + name = "requests" + version = "2.22.0" + url = ( + "https://files.pythonhosted.org/packages/01/62/" + "ddcf76d1d19885e8579acb1b1df26a852b03472c0e46d2b959a714c90608/requests-2.22.0.tar.gz" + ) + sha256 = "11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4" cmd = [ - "nix-build", f"{extractor_src}/fast-extractor.nix", - "--arg", "url", f'"{url}"', - "--arg", "sha256", f'"{sha256}"', - "--arg", "pkg", f'"{name}"', - "--arg", "version", f'"{version}"', - "--arg", "pyVersions", f'''[ {" ".join(map(lambda p: f'"{p}"', py_vers))} ]''', + "nix-build", + f"{extractor_src}/fast-extractor.nix", + "--arg", + "url", + f'"{url}"', + "--arg", + "sha256", + f'"{sha256}"', + "--arg", + "pkg", + f'"{name}"', + "--arg", + "version", + f'"{version}"', + "--arg", + "pyVersions", + f"""[ {" ".join(map(lambda p: f'"{p}"', py_vers))} ]""", "--no-out-link", ] if store: @@ -274,20 +342,21 @@ def build_base(extractor_src, py_vers, store=None): sp.check_call(cmd, timeout=1000) - def pkg_to_dict(pkg): - pkg_dict = asdict(PKG( - install_requires=pkg.install_requires, - setup_requires=pkg.setup_requires, - extras_require=pkg.extras_require, - tests_require=pkg.tests_require, - python_requires=pkg.python_requires - )) + pkg_dict = asdict( + PKG( + install_requires=pkg.install_requires, + setup_requires=pkg.setup_requires, + extras_require=pkg.extras_require, + tests_require=pkg.tests_require, + python_requires=pkg.python_requires, + ) + ) new_release = {} for key, val in pkg_dict.items(): if not val: continue - if key == 'extras_require': + if key == "extras_require": for extra_key, extra_reqs in val.items(): val[extra_key] = list(flatten_req_list(extra_reqs)) if key not in 
flatten_keys: @@ -298,7 +367,7 @@ def pkg_to_dict(pkg): val = [val] if not all(isinstance(elem, str) for elem in val): print(val) - raise Exception('Requirements must be list of strings') + raise Exception("Requirements must be list of strings") new_release[key] = val return new_release @@ -317,14 +386,14 @@ def flatten_req_list(obj): for s in flatten_req_list(elem): yield s else: - raise Exception('Is not list or str') + raise Exception("Is not list or str") flatten_keys = ( - 'setup_requires', - 'install_requires', - 'tests_require', - 'python_requires', + "setup_requires", + "install_requires", + "tests_require", + "python_requires", ) @@ -381,37 +450,49 @@ def purge(pypi_index, pkgs_dict: LazyBucketDict, bucket, py_vers): # purge all versions which are not on pypi anymore for name, vers in pkgs_dict.by_bucket(bucket).copy().items(): if name not in pypi_index: - print(f"deleting package {name} from DB because it has been removed from pypi") + print( + f"deleting package {name} from DB because it has been removed from pypi" + ) del pkgs_dict[name] continue for ver in tuple(vers.keys()): if ver not in pypi_index[name]: - print(f"deleting package {name} version {ver} from DB because it has been removed from pypi") + print( + f"deleting package {name} version {ver} from DB because it has been removed from pypi" + ) del pkgs_dict[name][ver] # purge old python versions for name, vers in pkgs_dict.by_bucket(bucket).copy().items(): for ver, pyvers in vers.copy().items(): for pyver in tuple(pyvers.keys()): if pyver not in py_vers: - print(f"deleting package {name} version {ver} for python {pyver}" - f" from DB because we dropped support for this python version") + print( + f"deleting package {name} version {ver} for python {pyver}" + f" from DB because we dropped support for this python version" + ) del pkgs_dict[name][ver][pyver] if len(pkgs_dict[name][ver]) == 0: - print(f"deleting package {name} version {ver} from DB" - f" because it is not compatible with any of our supported python versions") + print( + f"deleting package {name} version {ver} from DB" + f" because it is not compatible with any of our supported python versions" + ) del pkgs_dict[name][ver] if len(pkgs_dict[name]) == 0: - print(f"deleting package {name} from DB" - f" because it has no releases left which are compatible with any of our supported python versions") + print( + f"deleting package {name} from DB" + f" because it has no releases left which are compatible with any of our supported python versions" + ) del pkgs_dict[name] class Measure(ContextManager): def __init__(self, name): self.name = name + def __enter__(self): self.enter_time = time() print(f'beginning "{self.name}"') + def __exit__(self, exc_type, exc_val, exc_tb): dur = round(time() - self.enter_time, 1) print(f'"{self.name}" took {dur}s') @@ -419,28 +500,32 @@ def __exit__(self, exc_type, exc_val, exc_tb): def main(): # settings related to performance/parallelization - amount_buckets = int(os.environ.get('AMOUNT_BUCKETS', "256")) - limit_names = set(filter(lambda n: bool(n), os.environ.get('LIMIT_NAMES', "").split(','))) - max_minutes = int(os.environ.get('MAX_MINUTES', "0")) - bucket_jobs = int(os.environ.get('BUCKET_JOBS', "0")) - start_bucket = int(os.environ.get('BUCKET_START', "0")) - workers = int(os.environ.get('WORKERS', multiprocessing.cpu_count() * 2)) + amount_buckets = int(os.environ.get("AMOUNT_BUCKETS", "256")) + limit_names = set( + filter(lambda n: bool(n), os.environ.get("LIMIT_NAMES", "").split(",")) + ) + max_minutes = 
int(os.environ.get("MAX_MINUTES", "0")) + bucket_jobs = int(os.environ.get("BUCKET_JOBS", "0")) + start_bucket = int(os.environ.get("BUCKET_START", "0")) + workers = int(os.environ.get("WORKERS", multiprocessing.cpu_count() * 2)) # general settings - dump_dir = os.environ.get('DUMP_DIR', "./sdist") + dump_dir = os.environ.get("DUMP_DIR", "./sdist") extractor_src = os.environ.get("EXTRACTOR_SRC") if not extractor_src: raise Exception("Set env variable 'EXTRACTOR_SRC to {mach-nix}/lib/extractor'") - min_free_gb = int(os.environ.get('MIN_FREE_GB', "0")) - py_vers_short = os.environ.get('PYTHON_VERSIONS', "27,36,37,38,39,310").strip().split(',') - pypi_fetcher_dir = os.environ.get('PYPI_FETCHER', '/tmp/pypi_fetcher') - store = os.environ.get('STORE', None) + min_free_gb = int(os.environ.get("MIN_FREE_GB", "0")) + py_vers_short = ( + os.environ.get("PYTHON_VERSIONS", "27,36,37,38,39,310").strip().split(",") + ) + pypi_fetcher_dir = os.environ.get("PYPI_FETCHER", "/tmp/pypi_fetcher") + store = os.environ.get("STORE", None) deadline_total = time() + max_minutes * 60 if max_minutes else None # cache build time deps, otherwise first job will be slow - with Measure("ensure build time deps"): - build_base(extractor_src, py_vers_short, store=store) + # with Measure("ensure build time deps"): + # build_base(extractor_src, py_vers_short, store=store) garbage_collected = False @@ -454,12 +539,16 @@ def main(): if idx < start_bucket or idx >= start_bucket + amount_buckets: continue pkgs_dict = LazyBucketDict(dump_dir, restrict_to_bucket=bucket) - pypi_index = LazyBucketDict(f"{pypi_fetcher_dir}/pypi", restrict_to_bucket=bucket) + pypi_index = LazyBucketDict( + f"{pypi_fetcher_dir}/pypi", restrict_to_bucket=bucket + ) # load error data error_dict = LazyBucketDict(dump_dir + "-errors", restrict_to_bucket=bucket) decompress(error_dict.by_bucket(bucket)) - with Measure('Get processed pkgs'): - print(f"DB contains {len(list(pkgs_dict.keys()))} pkgs at this time for bucket {bucket}") + with Measure("Get processed pkgs"): + print( + f"DB contains {len(list(pkgs_dict.keys()))} pkgs at this time for bucket {bucket}" + ) with Measure("decompressing data"): decompress(pkgs_dict.by_bucket(bucket)) # purge data for old python versions and packages which got deleted from pypi @@ -467,17 +556,20 @@ def main(): purge(pypi_index, pkgs_dict, bucket, py_vers_short) with Measure("getting jobs"): jobs = get_jobs( - pypi_index, error_dict, pkgs_dict, bucket, py_vers_short, limit_num=bucket_jobs, limit_names=limit_names) + pypi_index, + error_dict, + pkgs_dict, + bucket, + py_vers_short, + limit_num=bucket_jobs, + limit_names=limit_names, + ) if not jobs: continue - compute_drvs(jobs, extractor_src, store=store) # ensure that all the build time dependencies are cached before starting, # otherwise jobs might time out - if garbage_collected: - with Measure("ensure build time deps"): - build_base(extractor_src, py_vers_short, store=store) - with Measure('executing jobs'): + with Measure("executing jobs"): if workers > 1: pool_results = utils.parallel( extract_requirements, @@ -485,12 +577,19 @@ def main(): jobs, (deadline,) * len(jobs), (len(jobs),) * len(jobs), - (store,) * len(jobs) + (store,) * len(jobs), + (extractor_src,) * len(jobs), ), workers=workers, - use_processes=False) + use_processes=False, + ) else: - pool_results = [extract_requirements(args, deadline, store) for args in jobs] + pool_results = [ + extract_requirements( + args, deadline, len(jobs), store, extractor_src + ) + for args in jobs + ] # filter out 
exceptions results = [] @@ -500,13 +599,17 @@ def main(): results.append(r) # insert new data - for pkg in sorted(results, key=lambda pkg: (pkg.name, pkg.version, sort_key_pyver(pkg.py_ver))): - py_ver = ''.join(filter(lambda c: c.isdigit(), pkg.py_ver)) + for pkg in sorted( + results, key=lambda pkg: (pkg.name, pkg.version, sort_key_pyver(pkg.py_ver)) + ): + py_ver = "".join(filter(lambda c: c.isdigit(), pkg.py_ver)) if pkg.error: target = error_dict else: target = pkgs_dict - insert(py_ver, pkg.name, pkg.version, pkg_to_dict(pkg), target, error=pkg.error) + insert( + py_ver, pkg.name, pkg.version, pkg_to_dict(pkg), target, error=pkg.error + ) # compress and save with Measure("compressing data"): @@ -523,7 +626,7 @@ def main(): sp.run( f"nix-collect-garbage {f'--store {store}' if store else ''}", capture_output=True, - shell=True + shell=True, ) garbage_collected = True