Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

conda crawler implementation #532

Merged
merged 17 commits into from
May 2, 2024
5 changes: 5 additions & 0 deletions config/cdConfig.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ module.exports = {
fetched: { defaultTtlSeconds: fetchedCacheTtlSeconds }
},
cocoapods: { githubToken },
conda: {
cdFileLocation: cd_file.location
},
cratesio: {},
debian: { cdFileLocation: cd_file.location },
git: {},
Expand All @@ -50,6 +53,8 @@ module.exports = {
process: {
cdsource: {},
component: {},
conda: { githubToken },
condasrc: {},
crate: { githubToken },
deb: {},
debsrc: {},
Expand Down
12 changes: 12 additions & 0 deletions config/map.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,16 @@ const npm = {
fossology
}

// Map entry for the 'conda' package type. It is referenced further down from both the
// `_package` and the `entities` tables. Every member other than `_type` is a shorthand
// reference to an identifier declared elsewhere in this file.
const conda = {
_type: 'conda',
source,
clearlydefined,
licensee,
reuse,
scancode,
fossology
}

const crate = {
_type: 'crate',
source,
Expand Down Expand Up @@ -127,6 +137,7 @@ const gem = {
const _package = {
_type: 'package',
npm,
conda,
crate,
deb,
go,
Expand Down Expand Up @@ -156,6 +167,7 @@ const entities = {
licensee,
reuse,
npm,
conda,
crate,
deb,
go,
Expand Down
202 changes: 202 additions & 0 deletions providers/fetch/condaFetch.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
// Copyright (c) Microsoft Corporation and others. Licensed under the MIT license.
// SPDX-License-Identifier: MIT

const AbstractFetch = require('./abstractFetch')
const { clone } = require('lodash')
const fs = require('fs')
const memCache = require('memory-cache')
const nodeRequest = require('request')
const FetchResult = require('../../lib/fetchResult')

/**
 * Fetcher for conda binary packages (`conda`) and their source archives (`condasrc`).
 *
 * Package metadata is read from each channel's `channeldata.json` and, for binary
 * packages, from the per-architecture `repodata.json`. Both index files are downloaded
 * into `options.cdFileLocation` and re-used until `CACHE_DURATION` has elapsed.
 */
class CondaFetch extends AbstractFetch {
  /**
   * @param {object} options - fetcher options; `cdFileLocation` is the folder in which
   *   downloaded channel index files are stored.
   */
  constructor(options) {
    super(options)
    this.packageMapFolder = this.options.cdFileLocation
    this.channels = {
      'anaconda-main': 'https://repo.anaconda.com/pkgs/main',
      'anaconda-r': 'https://repo.anaconda.com/pkgs/r',
      'conda-forge': 'https://conda.anaconda.org/conda-forge'
    }
    this.headers = {
      'User-Agent': 'clearlydefined.io crawler (clearlydefined@outlook.com)'
    }
    this.CACHE_DURATION = 8 * 60 * 60 * 1000 // 8 hours
  }

  /**
   * @param {object} request - crawler request
   * @returns {boolean|*} truthy when the request's spec names one of the known channels
   */
  canHandle(request) {
    const spec = this.toSpec(request)
    return spec && !!(this.channels[spec.provider])
  }

  // {type: conda|condasrc}/{provider: anaconda-main|anaconda-r|conda-forge}/{architecture|-}/{package name}/[{version | }]-[{build version | }]/
  // i.e. conda/conda-forge/linux-aarch64/numpy/1.13.0-py36/
  // conda/conda-forge/-/numpy/-py36/
  // conda/conda-forge/-/numpy/1.13.0-py36/
  // conda/conda-forge/linux-aarch64/numpy/-py36/
  // conda/conda-forge/-/numpy/
  // conda/conda-forge/-/numpy/-
  /**
   * Resolves the request against the channel metadata and downloads either the binary
   * package or the source archive, depending on the spec type.
   *
   * @param {object} request - crawler request
   * @returns {Promise<object>} the request, either marked as skipped or carrying a fetchResult
   */
  async handle(request) {
    const spec = this.toSpec(request)
    if (spec.type !== 'conda' && spec.type !== 'condasrc') {
      return request.markSkip('spec type must either be conda or condasrc')
    }
    const channelData = await this.getChannelData(this.channels[spec.provider], spec.provider)
    if (!channelData) {
      return request.markSkip('failed to fetch and parse channelData.json')
    }
    const architecture = spec.namespace
    // The revision is "<version>-<build>"; either half may be empty.
    const [version, buildVersion] = (spec.revision || '').split('-')
    if (channelData.packages[spec.name] === undefined) {
      return request.markSkip(`Missing package ${spec.name} in channel: ${spec.provider}`)
    }
    const packageChannelData = channelData.packages[spec.name]
    if (spec.type === 'condasrc') {
      return this._downloadCondaSourcePackage(spec, request, version, packageChannelData)
    }
    return this._downloadCondaPackage(spec, request, version, buildVersion, architecture, packageChannelData)
  }

  /**
   * Downloads the source archive referenced by the channel data (`source_url`).
   *
   * @param {object} spec - entity spec; its `revision` is overwritten with the channel data's version
   * @param {object} request - crawler request
   * @param {string} version - requested version, or empty to accept the channel data's version
   * @param {object} packageChannelData - the package's entry from channeldata.json
   * @returns {Promise<object>} the request
   */
  async _downloadCondaSourcePackage(spec, request, version, packageChannelData) {
    if (version && packageChannelData.version !== version) {
      return request.markSkip(`Missing source file version ${version} for package ${spec.name}`)
    }
    if (!packageChannelData.source_url) {
      return request.markSkip(`Missing archive source file in repodata for package ${spec.name}`)
    }
    const downloadUrl = new URL(`${packageChannelData.source_url}`).href
    spec.revision = packageChannelData.version
    request.url = spec.toUrl()
    super.handle(request)
    const file = this.createTempFile(request)
    const dir = this.createTempDir(request)
    await this._downloadPackage(downloadUrl, file.name)
    await this.decompress(file.name, dir.name)
    const hashes = await this.computeHashes(file.name)
    const fetchResult = new FetchResult(request.url)
    fetchResult.document = {
      location: dir.name,
      registryData: { 'channelData': packageChannelData, downloadUrl },
      releaseDate: new Date(packageChannelData.timestamp || 0).toISOString(),
      declaredLicenses: packageChannelData.license,
      hashes
    }
    fetchResult.casedSpec = clone(spec)
    request.fetchResult = fetchResult.adoptCleanup(dir, request)
    return request
  }

  /**
   * Finds the repodata entries that match a package name and, optionally, a version
   * and a build-string prefix.
   *
   * @param {string} name - package name
   * @param {string} [version] - exact version; falsy matches any version
   * @param {string} [buildVersion] - build-string prefix; falsy matches any build
   * @param {object} repoData - parsed repodata.json; `packages` and `packages.conda` are searched
   * @returns {Array<{packageFile: string, packageData: object}>} matches, newest timestamp first
   */
  _matchPackage(name, version, buildVersion, repoData) {
    const packageMatches = packageData =>
      packageData.name === name &&
      (!version || version === packageData.version) &&
      (!buildVersion || packageData.build.startsWith(buildVersion))
    const packageRepoEntries = []
    for (const section of ['packages', 'packages.conda']) {
      if (!repoData[section]) continue
      for (const [packageFile, packageData] of Object.entries(repoData[section])) {
        if (packageMatches(packageData)) packageRepoEntries.push({ packageFile, packageData })
      }
    }
    packageRepoEntries.sort((a, b) => (b.packageData.timestamp || 0) - (a.packageData.timestamp || 0))
    return packageRepoEntries
  }

  /**
   * Downloads the newest binary package that matches the requested version/build for
   * the requested (or a default) architecture.
   *
   * @param {object} spec - entity spec; `namespace` and `revision` are overwritten with the resolved values
   * @param {object} request - crawler request
   * @param {string} version - requested version, or empty for any
   * @param {string} buildVersion - requested build prefix, or empty/undefined for any
   * @param {string} architecture - requested subdir, or falsy / '-' to pick one automatically
   * @param {object} packageChannelData - the package's entry from channeldata.json
   * @returns {Promise<object>} the request
   */
  async _downloadCondaPackage(spec, request, version, buildVersion, architecture, packageChannelData) {
    const subdirs = packageChannelData.subdirs || []
    let selectedArchitecture = architecture
    // Only pick a default when there is something to pick from. The parentheses matter:
    // `&&` binds tighter than `||`.
    if ((!selectedArchitecture || selectedArchitecture === '-') && subdirs.length > 0) {
      // prefer no-arch if available
      selectedArchitecture = subdirs.includes('noarch') ? 'noarch' : subdirs[0]
      this.logger.info(`No binary architecture specified for ${spec.name}, using architecture: ${selectedArchitecture}`)
    }
    if (!subdirs.includes(selectedArchitecture)) {
      return request.markSkip(`Missing architecture ${selectedArchitecture} for package ${spec.name} in channel`)
    }
    const repoData = await this.getRepoData(this.channels[spec.provider], spec.provider, selectedArchitecture)
    if (!repoData) {
      return request.markSkip(`failed to fetch and parse repodata json file for channel ${spec.provider} in architecture ${selectedArchitecture}`)
    }
    const packageRepoEntries = this._matchPackage(spec.name, version, buildVersion, repoData)
    if (packageRepoEntries.length === 0) {
      return request.markSkip(`Missing package with matching spec (version: ${version}, buildVersion: ${buildVersion}) in ${selectedArchitecture} repository`)
    }
    const packageRepoEntry = packageRepoEntries[0]
    const downloadUrl = new URL(`${this.channels[spec.provider]}/${selectedArchitecture}/${packageRepoEntry.packageFile}`).href
    spec.namespace = selectedArchitecture
    spec.revision = packageRepoEntry.packageData.version + '-' + packageRepoEntry.packageData.build
    request.url = spec.toUrl()
    super.handle(request)
    const file = this.createTempFile(request)
    const dir = this.createTempDir(request)
    await this._downloadPackage(downloadUrl, file.name)
    await this.decompress(file.name, dir.name)
    const hashes = await this.computeHashes(file.name)
    const fetchResult = new FetchResult(request.url)
    fetchResult.document = {
      location: dir.name,
      registryData: { 'channelData': packageChannelData, 'repoData': packageRepoEntry, downloadUrl },
      releaseDate: new Date(packageRepoEntry.packageData.timestamp || 0).toISOString(),
      declaredLicenses: packageRepoEntry.packageData.license,
      hashes
    }
    fetchResult.casedSpec = clone(spec)
    request.fetchResult = fetchResult.adoptCleanup(dir, request)
    return request
  }

  /**
   * Streams `sourceUrl` into `destination`. The body is written only for a 200
   * response, so an error page never ends up in the destination file.
   *
   * @param {string} sourceUrl - URL to download
   * @param {string} destination - path of the file to write
   * @returns {Promise<void>} resolves once the file has been fully written; rejects on a
   *   request error, a non-200 status, or a write error
   */
  _downloadToFile(sourceUrl, destination) {
    return new Promise((resolve, reject) => {
      const options = { url: sourceUrl, headers: this.headers }
      nodeRequest
        .get(options)
        .on('error', reject)
        .on('response', response => {
          if (response.statusCode !== 200) {
            return reject(new Error(`${response.statusCode} ${response.statusMessage}`))
          }
          response
            .pipe(fs.createWriteStream(destination))
            // Without a listener a write failure would surface as an unhandled 'error' event.
            .on('error', reject)
            .on('finish', () => resolve())
        })
    })
  }

  /**
   * Downloads a package archive.
   *
   * @param {string} downloadUrl - URL of the archive
   * @param {string} destination - path of the file to write
   * @returns {Promise<void>}
   */
  async _downloadPackage(downloadUrl, destination) {
    return this._downloadToFile(downloadUrl, destination)
  }

  /**
   * Downloads `sourceUrl` to `fileDstLocation` unless `cacheKey` is still marked as fresh.
   * The key is marked only after a successful download.
   *
   * @param {string} cacheKey - memory-cache key that tracks freshness of the file
   * @param {string} sourceUrl - URL to download
   * @param {number} cacheDuration - freshness period in milliseconds
   * @param {string} fileDstLocation - path of the file to write
   * @returns {Promise<void>}
   */
  async _cachedDownload(cacheKey, sourceUrl, cacheDuration, fileDstLocation) {
    if (memCache.get(cacheKey)) return
    await this._downloadToFile(sourceUrl, fileDstLocation)
    memCache.put(cacheKey, true, cacheDuration)
    this.logger.info(`Conda: retrieved ${sourceUrl}. Stored data file at ${fileDstLocation}`)
  }

  /**
   * Returns the parsed JSON document for `url`, downloading it first when the cached
   * copy is missing or stale.
   *
   * @param {string} cacheKey - memory-cache key that tracks freshness of the file
   * @param {string} url - URL of the JSON document
   * @param {number} cacheDuration - freshness period in milliseconds
   * @param {string} fileLocation - path of the on-disk copy
   * @returns {Promise<object|null>} the parsed document, or null when it could not be
   *   downloaded, read or parsed
   */
  async _fetchCachedJSONFile(cacheKey, url, cacheDuration, fileLocation) {
    try {
      await this._cachedDownload(cacheKey, url, cacheDuration, fileLocation)
      return JSON.parse(fs.readFileSync(fileLocation, 'utf8'))
    } catch (error) {
      // Forget the key so that the next request retries instead of re-reading a bad file.
      memCache.del(cacheKey)
      this.logger.info(`Conda: failed to retrieve or parse ${url}: ${error.message}`)
      return null
    }
  }

  /**
   * @param {string} condaChannelUrl - base URL of the channel
   * @param {string} condaChannelID - channel identifier, used in the cache key and file name
   * @returns {Promise<object|null>} parsed channeldata.json, or null on failure
   */
  async getChannelData(condaChannelUrl, condaChannelID) {
    return await this._fetchCachedJSONFile(`${condaChannelID}-channelDataFile`, `${condaChannelUrl}/channeldata.json`, this.CACHE_DURATION, `${this.packageMapFolder}/${condaChannelID}-channelDataFile.json`)
  }

  /**
   * @param {string} condaChannelUrl - base URL of the channel
   * @param {string} condaChannelID - channel identifier, used in the cache key and file name
   * @param {string} architecture - channel subdir, e.g. `noarch`
   * @returns {Promise<object|null>} parsed repodata.json for the subdir, or null on failure
   */
  async getRepoData(condaChannelUrl, condaChannelID, architecture) {
    return await this._fetchCachedJSONFile(`${condaChannelID}-repoDataFile-${architecture}`, `${condaChannelUrl}/${architecture}/repodata.json`, this.CACHE_DURATION, `${this.packageMapFolder}/${condaChannelID}-repoDataFile-${architecture}.json`)
  }
}

module.exports = options => new CondaFetch(options)
4 changes: 3 additions & 1 deletion providers/fetch/dispatcher.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ class FetchDispatcher extends AbstractFetch {
if (!force && this.filter && !this.filter.shouldFetch(request)) return request
// get the right real fetcher for this request and dispatch
const handler = this._getHandler(request, this.fetchers)
if (!handler) throw new Error(`No fetcher found for ${request.toString()}`)
if (!handler) {
throw new Error(`No fetcher found for ${request.toString()}`)
}

await this._fetchResult(request, handler)
return request
Expand Down
3 changes: 3 additions & 0 deletions providers/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ module.exports = {
fetch: {
cdDispatch: require('./fetch/dispatcher'),
cocoapods: require('./fetch/podFetch'),
conda: require('./fetch/condaFetch'),
packagist: require('./fetch/packagistFetch'),
cratesio: require('./fetch/cratesioFetch'),
debian: require('./fetch/debianFetch'),
Expand All @@ -28,6 +29,8 @@ module.exports = {
process: {
cdsource: require('./process/sourceExtract'),
component: require('./process/component'),
conda: require('./process/condaExtract'),
condasrc: require('./process/condaSrcExtract'),
crate: require('./process/crateExtract'),
deb: require('./process/debExtract'),
debsrc: require('./process/debsrcExtract'),
Expand Down
63 changes: 63 additions & 0 deletions providers/process/condaExtract.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
const AbstractClearlyDefinedProcessor = require('./abstractClearlyDefinedProcessor')
const sourceDiscovery = require('../../lib/sourceDiscovery')
const { merge } = require('lodash')
const SourceSpec = require('../../lib/sourceSpec')

/**
 * Processor for fetched `conda` packages: carries the fetch metadata over to the
 * output document, works out where the package's source lives, and queues that source.
 */
class CondaExtract extends AbstractClearlyDefinedProcessor {
  /**
   * @param {object} options - processor options (`githubToken` is passed on to the source finder)
   * @param {Function} sourceFinder - async (version, candidateUrls, options) => source info or a falsy value
   */
  constructor(options, sourceFinder) {
    super(options)
    this.sourceFinder = sourceFinder
  }

  get toolVersion() {
    return '0.0.1'
  }

  /**
   * @param {object} request - crawler request
   * @returns {boolean|*} truthy only for `conda` requests whose spec type is also `conda`
   */
  canHandle(request) {
    const spec = this.toSpec(request)
    if (request.type !== 'conda') return false
    if (!spec) return spec
    return spec.type === 'conda'
  }

  /**
   * @param {object} request - crawler request carrying the fetched document
   * @returns {Promise<object>} the same request
   */
  async handle(request) {
    if (this.isProcessing(request)) {
      await super.handle(request)
      const spec = this.toSpec(request)
      const fetched = request.document
      const carriedOver = {
        releaseDate: fetched.releaseDate,
        registryData: fetched.registryData,
        declaredLicenses: fetched.declaredLicenses
      }
      request.document = merge(this.clone(request.document), carriedOver)
      request.document.sourceInfo = await this._discoverSource(spec, carriedOver.registryData)
    }
    this.addLocalToolTasks(request)
    const { sourceInfo } = request.document
    if (sourceInfo) {
      const sourceSpec = SourceSpec.fromObject(sourceInfo)
      this.linkAndQueue(request, 'source', sourceSpec.toEntitySpec())
    }
    return request
  }

  /**
   * Asks the source finder to resolve one of the URLs listed in the channel data. When
   * it finds nothing, falls back to a `condasrc` spec for the same package and version.
   *
   * @param {object} spec - entity spec of the package being processed
   * @param {object} registryData - `channelData` and `repoData` recorded by the fetcher
   * @returns {Promise<object>} source info
   */
  async _discoverSource(spec, registryData) {
    const { channelData } = registryData
    const candidateUrls = [
      channelData.source_url,
      channelData.source_git_url,
      channelData.home,
      channelData.dev_url,
      channelData.doc_url,
      channelData.doc_source_url
    ].filter(Boolean)
    const finderOptions = {
      githubToken: this.options.githubToken,
      logger: this.logger
    }
    const githubSource = await this.sourceFinder(registryData.repoData.packageData.version, candidateUrls, finderOptions)
    if (githubSource) return githubSource

    const fallback = SourceSpec.fromObject(spec)
    fallback.type = 'condasrc'
    fallback.namespace = null
    // Keep only the version half of "<version>-<build>".
    fallback.revision = spec.revision.split('-')[0]
    return fallback
  }
}

module.exports = (options, sourceFinder) => new CondaExtract(options, sourceFinder || sourceDiscovery)
25 changes: 25 additions & 0 deletions providers/process/condaSrcExtract.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
const AbstractClearlyDefinedProcessor = require('./abstractClearlyDefinedProcessor')
const { merge } = require('lodash')

/**
 * Processor for fetched `condasrc` (conda source archive) entities: carries the fetch
 * metadata over to the output document.
 *
 * No constructor is declared because the implicit one already forwards `options` to
 * the base class.
 */
class CondaSrcExtract extends AbstractClearlyDefinedProcessor {
  get toolVersion() {
    return '0.0.1'
  }

  /**
   * @param {object} request - crawler request
   * @returns {boolean|*} truthy only for `clearlydefined` requests whose spec type is `condasrc`
   */
  canHandle(request) {
    const spec = this.toSpec(request)
    return request.type === 'clearlydefined' && spec && spec.type === 'condasrc'
  }

  /**
   * @param {object} request - crawler request carrying the fetched document
   * @returns {Promise<object>} the same request, returned for consistency with the
   *   other conda processor
   */
  async handle(request) {
    await super.handle(request)
    const { releaseDate, registryData, declaredLicenses } = request.document
    request.document = merge(this.clone(request.document), { releaseDate, registryData, declaredLicenses })
    return request
  }
}

module.exports = (options) => new CondaSrcExtract(options)
2 changes: 1 addition & 1 deletion providers/process/package.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

const AbstractProcessor = require('./abstractProcessor')

const supportedTypes = ['npm', 'crate', 'maven', 'nuget', 'gem', 'go', 'pod', 'pypi', 'composer', 'deb']
const supportedTypes = ['npm', 'conda', 'crate', 'maven', 'nuget', 'gem', 'go', 'pod', 'pypi', 'composer', 'deb']

class PackageProcessor extends AbstractProcessor {
shouldFetch() {
Expand Down
2 changes: 1 addition & 1 deletion providers/process/source.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// SPDX-License-Identifier: MIT

const AbstractProcessor = require('./abstractProcessor')
const supportedTypes = ['git', 'sourcearchive', 'debsrc']
const supportedTypes = ['git', 'sourcearchive', 'debsrc', 'condasrc']

class SourceProcessor extends AbstractProcessor {
shouldFetch() {
Expand Down
Loading