Staging
v0.8.1
https://github.com/python/cpython
Raw File
Tip revision: 550e4673be538d98b6ddf5550b3922539cf5c4b2 authored by Victor Stinner on 08 December 2020, 23:32:54 UTC
bpo-32381: Add _PyRun_SimpleFileObject() (GH-23709)
Tip revision: 550e467
download_pypi_packages.py
#!/usr/bin/env python3.8

import argparse
import os
import json

from typing import Dict, Any
from urllib.request import urlretrieve

# Command-line interface for the script; parsed inside main().
argparser = argparse.ArgumentParser(
    prog="download_pypi_packages",
    description="Helper program to download PyPI packages",
)
# -n/--number: how many entries from the top of the package list to fetch.
argparser.add_argument(
    "-n",
    "--number",
    type=int,
    default=100,
    help="Number of packages to download",
)
# -a/--all: fetch every listed package; takes precedence over --number in main().
argparser.add_argument(
    "-a",
    "--all",
    action="store_true",
    help="Download all packages listed in the json file",
)


def load_json(filename: str) -> Dict[Any, Any]:
    """Parse and return the JSON document stored at ``data/<filename>.json``.

    The path is relative to the current working directory.

    Args:
        filename: Base name of the file, without the ``data`` directory or
            the ``.json`` extension.

    Returns:
        The decoded JSON content.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    path = os.path.join("data", f"{filename}.json")
    # Decode explicitly as UTF-8 instead of the locale's default encoding, so
    # non-ASCII metadata parses the same way on every platform.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def remove_json(filename: str) -> None:
    """Delete ``data/<filename>.json`` (relative to the working directory).

    Args:
        filename: Base name of the file, without directory or extension.
    """
    os.remove(os.path.join("data", f"{filename}.json"))


def download_package_json(package_name: str) -> None:
    """Fetch the PyPI JSON metadata of *package_name* into ``data/<package_name>.json``.

    Args:
        package_name: Project name, interpolated as-is into the
            ``https://pypi.org/pypi/<name>/json`` URL and the local file name.
    """
    destination = os.path.join("data", f"{package_name}.json")
    urlretrieve(f"https://pypi.org/pypi/{package_name}/json", destination)


def download_package_code(name: str, package_json: Dict[Any, Any]) -> None:
    """Download the source distribution listed in *package_json* into ``data/pypi``.

    The first entry of ``package_json["urls"]`` whose ``"python_version"`` is
    ``"source"`` is fetched and stored under its own ``"filename"``.

    Args:
        name: Package name; used only in the error message.
        package_json: Decoded package metadata containing a ``"urls"`` list of
            dicts with ``"python_version"``, ``"filename"`` and ``"url"`` keys.

    Raises:
        IndexError: If no entry in ``"urls"`` is a source distribution.
        KeyError: If an expected key is missing from the metadata.
    """
    source_info = next(
        (info for info in package_json["urls"] if info["python_version"] == "source"),
        None,
    )
    if source_info is None:
        # Previously a "not found" index of -1 silently selected the *last*
        # listed file (usually a wheel). Fail explicitly instead; IndexError
        # matches what an empty "urls" list has always raised here.
        raise IndexError(f"no source distribution listed for {name}")
    urlretrieve(source_info["url"], os.path.join("data", "pypi", source_info["filename"]))


def main() -> None:
    """Download source archives for the packages in the top-PyPI-packages list.

    Reads ``data/top-pypi-packages-365-days.json`` and selects either every
    row (``--all``) or the first ``--number`` rows. For each selected package
    it fetches the PyPI JSON metadata, downloads the source archive into
    ``data/pypi``, and finally deletes the per-package metadata file.

    Raises:
        AssertionError: If ``--all`` is not given and ``--number`` is outside
            the range 0..4000.
    """
    args = argparser.parse_args()
    number_packages = args.number

    top_pypi_packages = load_json("top-pypi-packages-365-days")
    if args.all:
        packages = top_pypi_packages["rows"]
    elif 0 <= number_packages <= 4000:
        packages = top_pypi_packages["rows"][:number_packages]
    else:
        raise AssertionError("Unknown value for NUMBER_OF_PACKAGES")

    os.makedirs(os.path.join("data", "pypi"), exist_ok=True)

    for package in packages:
        package_name = package["project"]

        # flush=True: these messages have no trailing newline, so without an
        # explicit flush they would only appear once the download is over.
        print(f"Downloading JSON Data for {package_name}... ", end="", flush=True)
        download_package_json(package_name)
        print("Done")

        try:
            # Loading happens inside the try so the metadata file is removed
            # even when it cannot be parsed.
            package_json = load_json(package_name)
            print(
                f"Downloading and compressing package {package_name} ... ",
                end="",
                flush=True,
            )
            download_package_code(package_name, package_json)
            print("Done")
        except (IndexError, KeyError):
            print(f"Could not locate source for {package_name}")
        finally:
            remove_json(package_name)


# Run the downloader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
back to top