Skip to content

Commit

Permalink
Refactor to use C++ implementation of requirement gathering for Python.
Browse files Browse the repository at this point in the history
  • Loading branch information
dom96 committed Feb 18, 2025
1 parent 69ce716 commit 3895bf9
Show file tree
Hide file tree
Showing 7 changed files with 190 additions and 30 deletions.
34 changes: 4 additions & 30 deletions src/pyodide/internal/setupPackages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ import {
LOAD_WHEELS_FROM_R2,
LOCKFILE,
LOAD_WHEELS_FROM_ARTIFACT_BUNDLER,
PACKAGES_VERSION,
USING_OLDEST_PACKAGES_VERSION,
} from 'pyodide-internal:metadata';
import { simpleRunPython } from 'pyodide-internal:util';
import { default as EmbeddedPackagesTarReader } from 'pyodide-internal:packages_tar_reader';
import { default as MetadataReader } from 'pyodide-internal:runtime-generated/metadata';

const canonicalizeNameRegex = /[-_.]+/g;

Expand Down Expand Up @@ -215,34 +215,6 @@ function disabledLoadPackage(): never {
);
}

/**
 * Get the set of transitive requirements from the REQUIREMENTS metadata.
 *
 * Unless USING_OLDEST_PACKAGES_VERSION is set, STDLIB_PACKAGES are merged into the
 * requirements before the dependencies are resolved against LOCKFILE.
 *
 * @returns The canonicalized names of every package returned by `recursiveDependencies`.
 * @throws Error if stdlib packages are being included and STDLIB_PACKAGES is empty.
 */
function getTransitiveRequirements(): Set<string> {
  // TODO: Package snapshot built on '20240829.4' is broken when the stdlib packages are added
  // here, so we disable this logic. We should remove this check once the next Python and packages
  // versions are rolled out.
  let requirements = REQUIREMENTS;
  if (!USING_OLDEST_PACKAGES_VERSION) {
    // We have had a regression where the lockfile format changed and so no stdlib packages were
    // found. This created a strange and unfriendly error message, so we throw a nice error here if
    // there are no stdlib packages to avoid this if it happens in the future.
    if (STDLIB_PACKAGES.length === 0) {
      throw new Error(
        'Found no STDLIB_PACKAGES, expected at least 1. Lock file corrupted?'
      );
    }
    // Merge first, then canonicalize each name while converting the set back to an array.
    requirements = Array.from(
      new Set(requirements).union(new Set(STDLIB_PACKAGES)),
      canonicalizePackageName
    );
  }

  // resolve transitive dependencies of requirements and if IN_WORKERD install them from the cdn.
  const packageDatas = recursiveDependencies(LOCKFILE, requirements);
  return new Set(packageDatas.map(({ name }) => canonicalizePackageName(name)));
}

export function getSitePackagesPath(Module: Module): string {
const pymajor = Module._py_version_major();
const pyminor = Module._py_version_minor();
Expand Down Expand Up @@ -340,5 +312,7 @@ function addPackageToLoad(
}

export { REQUIREMENTS };
export const TRANSITIVE_REQUIREMENTS = getTransitiveRequirements();
export const TRANSITIVE_REQUIREMENTS = new Set(
MetadataReader.getTransitiveRequirements()
);
export const VIRTUALIZED_DIR = buildVirtualizedDir(TRANSITIVE_REQUIREMENTS);
1 change: 1 addition & 0 deletions src/pyodide/types/runtime-generated/metadata.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ declare namespace MetadataReader {
const getPackagesVersion: () => string;
const getPackagesLock: () => string;
const read: (index: number, position: number, buffer: Uint8Array) => number;
const getTransitiveRequirements: () => string[];
}

export default MetadataReader;
2 changes: 2 additions & 0 deletions src/workerd/api/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,12 @@ wd_cc_library(
name = "pyodide",
srcs = [
"pyodide/pyodide.c++",
"pyodide/requirements.c++",
"pyodide/setup-emscripten.c++",
],
hdrs = [
"pyodide/pyodide.h",
"pyodide/requirements.h",
"pyodide/setup-emscripten.h",
"//src/pyodide:generated/pyodide_extra.capnp.h",
],
Expand Down
14 changes: 14 additions & 0 deletions src/workerd/api/pyodide/pyodide.c++
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
// https://opensource.org/licenses/Apache-2.0
#include "pyodide.h"

#include "requirements.h"

#include <workerd/api/pyodide/setup-emscripten.h>
#include <workerd/io/compatibility-date.h>
#include <workerd/util/string-buffer.h>
Expand Down Expand Up @@ -117,6 +119,18 @@ int PyodideMetadataReader::readMemorySnapshot(int offset, kj::Array<kj::byte> bu
return readToTarget(KJ_REQUIRE_NONNULL(memorySnapshot), offset, buf);
}

// Returns the names produced by getPythonPackageNames() for this reader's `requirements` and
// `packagesVersion`, resolved against the packages parsed from `packagesLock`.
kj::Array<kj::String> PyodideMetadataReader::getTransitiveRequirements() {
  auto lockPackages = parseLockFile(packagesLock);
  auto dependencyMap = getDepMapFromPackagesLock(*lockPackages);
  auto resolved =
      getPythonPackageNames(*lockPackages, dependencyMap, requirements, packagesVersion);

  // Copy the set into a flat array, which is the declared return type.
  auto names = kj::heapArrayBuilder<kj::String>(resolved.size());
  for (const auto& packageName: resolved) {
    names.add(kj::str(packageName));
  }
  return names.finish();
}

int ArtifactBundler::readMemorySnapshot(int offset, kj::Array<kj::byte> buf) {
if (existingSnapshot == kj::none) {
return 0;
Expand Down
3 changes: 3 additions & 0 deletions src/workerd/api/pyodide/pyodide.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,8 @@ class PyodideMetadataReader: public jsg::Object {
return kj::str(packagesLock);
}

kj::Array<kj::String> getTransitiveRequirements();

JSG_RESOURCE_TYPE(PyodideMetadataReader) {
JSG_METHOD(isWorkerd);
JSG_METHOD(isTracing);
Expand All @@ -190,6 +192,7 @@ class PyodideMetadataReader: public jsg::Object {
JSG_METHOD(getPackagesVersion);
JSG_METHOD(getPackagesLock);
JSG_METHOD(isCreatingBaselineSnapshot);
JSG_METHOD(getTransitiveRequirements);
}

void visitForMemoryInfo(jsg::MemoryTracker& tracker) const {
Expand Down
132 changes: 132 additions & 0 deletions src/workerd/api/pyodide/requirements.c++
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
// Copyright (c) 2017-2022 Cloudflare, Inc.
// Licensed under the Apache 2.0 license found in the LICENSE file or at:
// https://opensource.org/licenses/Apache-2.0
#include "requirements.h"

#include <cctype>

namespace workerd::api::pyodide {

// getField returns the value of the first field of the JSON object `object` whose name equals
// `name`. If no field has that name, it fails with an assertion that reports `name`.
capnp::json::Value::Reader getField(
    capnp::List<::capnp::json::Value::Field, capnp::Kind::STRUCT>::Reader &object,
    kj::StringPtr name) {
  for (auto index: kj::indices(object)) {
    auto field = object[index];
    if (field.getName() == name) {
      return field.getValue();
    }
  }

  KJ_FAIL_ASSERT("Expected key in JSON object", name);
}

// canonicalizePythonPackageName normalizes a Python package name: every run of one or more
// separator characters ('-', '_' or '.') collapses into a single '-', and every other character
// is lower-cased with std::tolower.
kj::String canonicalizePythonPackageName(kj::StringPtr name) {
  const size_t length = name.size();

  // Worst case nothing collapses, so reserve the full length plus one slot for the terminator.
  kj::Vector<char> res(length + 1);

  auto isSeparator = [](char c) { return c == '-' || c == '_' || c == '.'; };

  size_t i = 0;
  while (i < length) {
    if (isSeparator(name[i])) {
      res.add('-');
      // Skip the rest of the separator run; the explicit bound keeps the scan inside the name.
      while (i < length && isSeparator(name[i])) i++;
      continue;
    }

    // std::tolower has undefined behavior for negative arguments other than EOF, and plain
    // `char` may be signed, so convert through unsigned char first.
    res.add(static_cast<char>(std::tolower(static_cast<unsigned char>(name[i]))));
    i++;
  }

  res.add('\0');  // kj::String(Array<char>) requires a NUL-terminated buffer

  return kj::String(res.releaseAsArray());
}

// getDepMapFromPackagesLock builds the dependency map (requirement -> list of dependencies) from
// the "packages" object of the Pyodide lock file JSON. Each field of `packages` becomes one map
// entry: the key is the field name exactly as written, and the value holds the strings of that
// package's "depends" array.
DepMap getDepMapFromPackagesLock(
    capnp::List<capnp::json::Value::Field, capnp::Kind::STRUCT>::Reader &packages) {
  DepMap depMap;

  for (const auto &package: packages) {
    auto packageFields = package.getValue().getObject();
    auto dependsList = getField(packageFields, "depends").getArray();

    // Create the entry with a pre-sized vector, then fill the vector in place.
    auto &entry =
        depMap.insert(kj::str(package.getName()), kj::Vector<kj::String>(dependsList.size()));
    for (const auto &dependency: dependsList) {
      entry.value.add(kj::str(dependency.getString()));
    }
  }

  return depMap;
}

// addWithRecursiveDependencies inserts the canonicalized form of `requirement` into
// `requirementsSet` and then does the same, recursively, for every dependency that `depMap` lists
// under that canonicalized name. A name already present in the set is left alone, which also
// stops the recursion on dependency cycles.
void addWithRecursiveDependencies(
    kj::StringPtr requirement, const DepMap &depMap, kj::HashSet<kj::String> &requirementsSet) {
  auto canonicalName = canonicalizePythonPackageName(requirement);

  if (!requirementsSet.contains(canonicalName)) {
    // Record the name before descending so a cycle back to it is seen as already handled.
    requirementsSet.insert(kj::str(canonicalName));

    KJ_IF_SOME(dependencies, depMap.find(canonicalName)) {
      for (const auto &dependency: dependencies) {
        addWithRecursiveDependencies(dependency, depMap, requirementsSet);
      }
    }
  }
}

// parseLockFile decodes `lockFileContents` as JSON and returns a heap-allocated copy of the
// top-level "packages" object (the list of its fields).
kj::Own<capnp::List<capnp::json::Value::Field>::Reader> parseLockFile(
    kj::StringPtr lockFileContents) {
  capnp::JsonCodec codec;
  capnp::MallocMessageBuilder decodedMessage;

  auto root = decodedMessage.initRoot<capnp::JsonValue>();
  codec.decodeRaw(lockFileContents, root);

  auto topLevelFields = root.getObject().asReader();
  auto packagesObject = getField(topLevelFields, "packages").getObject();

  // `decodedMessage` is a local that is destroyed on return, so hand back a clone rather than a
  // reader that points into it.
  return capnp::clone(packagesObject);
}

// getPythonPackageNames returns the canonicalized names of `requirements` plus all of their
// recursive dependencies according to `depMap`. Unless `packagesVersion` is "20240829.4", every
// entry of `packages` whose "package_type" is "cpython_module" is treated as an extra
// requirement and is added ahead of the explicit ones.
kj::HashSet<kj::String> getPythonPackageNames(
    capnp::List<capnp::json::Value::Field>::Reader packages,
    const DepMap &depMap,
    kj::ArrayPtr<kj::String> requirements,
    kj::StringPtr packagesVersion) {
  // Requirements including their recursive dependencies.
  kj::HashSet<kj::String> resolved;

  // TODO: Loading stdlib and its dependencies breaks package snapshots on "20240829.4".
  // Remove this version check once a new package/python release is made.
  const bool includeStdlib = packagesVersion != "20240829.4";

  if (includeStdlib) {
    // We need to scan the packages list for any packages that need to be included because they
    // are part of Python's stdlib (hashlib etc). These need to be implicitly treated as part of
    // our `requirements`.
    for (const auto &package: packages) {
      auto packageFields = package.getValue().getObject();
      auto packageType = getField(packageFields, "package_type").getString();

      if (packageType == "cpython_module"_kj) {
        addWithRecursiveDependencies(package.getName(), depMap, resolved);
      }
    }
  }

  // Then the explicit requirements, each with all of its recursive dependencies.
  for (const auto &requirement: requirements) {
    addWithRecursiveDependencies(requirement, depMap, resolved);
  }

  return resolved;
}

} // namespace workerd::api::pyodide
34 changes: 34 additions & 0 deletions src/workerd/api/pyodide/requirements.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright (c) 2017-2022 Cloudflare, Inc.
// Licensed under the Apache 2.0 license found in the LICENSE file or at:
// https://opensource.org/licenses/Apache-2.0
#pragma once

#include <capnp/compat/json.h>
#include <capnp/message.h>
#include <kj/common.h>
#include <kj/map.h>

namespace workerd::api::pyodide {

// Returns the value of the first field of the JSON object `object` whose name equals `name`.
// Fails an assertion ("Expected key in JSON object") if no field has that name.
capnp::json::Value::Reader getField(
capnp::List<::capnp::json::Value::Field, capnp::Kind::STRUCT>::Reader &object,
kj::StringPtr name);

// Normalizes a Python package name: each run of '-', '_' or '.' characters becomes a single '-'
// and all other characters are lower-cased.
kj::String canonicalizePythonPackageName(kj::StringPtr name);

// map from requirement to list of dependencies
typedef kj::HashMap<kj::String, kj::Vector<kj::String>> DepMap;

// Builds a DepMap from the lock file's "packages" object: one entry per field, keyed by the field
// name, holding the strings of that package's "depends" array.
DepMap getDepMapFromPackagesLock(
capnp::List<capnp::json::Value::Field, capnp::Kind::STRUCT>::Reader &packages);

// Decodes `lockFileContents` as JSON and returns an owned copy of its top-level "packages"
// object.
kj::Own<capnp::List<capnp::json::Value::Field>::Reader> parseLockFile(
kj::StringPtr lockFileContents);

// Returns the canonicalized names of `requirements` and all of their recursive dependencies
// according to `depMap`. Unless `packagesVersion` is "20240829.4", packages whose "package_type"
// is "cpython_module" are included as well, together with their dependencies.
kj::HashSet<kj::String> getPythonPackageNames(
capnp::List<capnp::json::Value::Field>::Reader packages,
const DepMap &depMap,
kj::ArrayPtr<kj::String> requirements,
kj::StringPtr packagesVersion);

} // namespace workerd::api::pyodide

0 comments on commit 3895bf9

Please sign in to comment.