Skip to content
This repository has been archived by the owner on Aug 12, 2020. It is now read-only.

Commit

Permalink
feat: support --raw-leaves
Browse files Browse the repository at this point in the history
Goes some way towards fixing ipfs/js-ipfs#1432 - will need follow up PRs for js-ipfs-mfs and js-ipfs itself (🔜).

There are three ways of importing a file we need to support and each will end up with slightly different DAG structure.

ipfs add will result in a balanced DAG with leaf nodes that are unixfs nodes of type file
ipfs files write results in a trickle DAG with leaf nodes that are unixfs nodes of type raw
ipfs add --raw-leaves and ipfs files write --raw-leaves have the balanced/trickle DAG of above, but the leaf nodes are chunks of file data not wrapped in protobufs.
In all cases above the root node is a unixfs file node with a v0 CID, unless you specify --cid-version=1.

This PR:

Changes the meaning of the existing rawLeaves argument. It now means the leaf node is just data - a chunk of the file; previously it meant a unixfs node with type raw. So far the only code using this is js-ipfs-mfs, so changing it shouldn't be too disruptive.
Adds a leafType option which can be file or raw - when --raw-leaves is false, this is what the unixfs leaf type will be.
Uses CIDv1 for raw leaves with the codec raw
  • Loading branch information
achingbrain committed Jul 17, 2018
1 parent 41b8ce5 commit 9105db1
Show file tree
Hide file tree
Showing 8 changed files with 217 additions and 46 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,8 @@ The input's file paths and directory structure will be preserved in the [`dag-pb
- `onlyHash` (boolean, defaults to false): Only chunk and hash - do not write to disk
- `hashAlg` (string): multihash hashing algorithm to use
- `cidVersion` (integer, default 0): the CID version to use when storing the data (storage keys are based on the CID, _including_ its version)
- `rawLeafNodes` (boolean, defaults to false): When a file would span multiple DAGNodes, if this is true the leaf nodes will be marked as `raw` `unixfs` nodes
- `rawLeaves` (boolean, defaults to false): When a file would span multiple DAGNodes, if this is true the leaf nodes will not be wrapped in `UnixFS` protobufs and will instead contain the raw file bytes
- `leafType` (string, defaults to `'file'`): what type of UnixFS node leaves should be - can be `'file'` or `'raw'` (ignored when `rawLeaves` is `true`)

### Exporter

Expand Down
75 changes: 51 additions & 24 deletions src/builder/builder.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ const parallel = require('async/parallel')
const waterfall = require('async/waterfall')
const dagPB = require('ipld-dag-pb')
const CID = require('cids')
const multihash = require('multihashing-async')

const reduce = require('./reduce')

Expand All @@ -17,7 +18,10 @@ const defaultOptions = {
chunkerOptions: {
maxChunkSize: 262144
},
rawLeafNodes: false
rawLeaves: false,
hashAlg: 'sha2-256',
leafType: 'file',
cidVersion: 0
}

module.exports = function (createChunker, ipld, createReducer, _options) {
Expand Down Expand Up @@ -62,15 +66,13 @@ module.exports = function (createChunker, ipld, createReducer, _options) {
waterfall([
(cb) => DAGNode.create(d.marshal(), [], options.hashAlg, cb),
(node, cb) => {
if (options.onlyHash) return cb(null, node)

let cid = new CID(node.multihash)

if (options.cidVersion === 1) {
cid = cid.toV1()
if (options.onlyHash) {
return cb(null, node)
}

ipld.put(node, { cid }, (err) => cb(err, node))
ipld.put(node, {
cid: new CID(options.cidVersion, 'dag-pb', node.multihash)
}, (err) => cb(err, node))
}
], (err, node) => {
if (err) {
Expand All @@ -97,7 +99,6 @@ module.exports = function (createChunker, ipld, createReducer, _options) {

let previous
let count = 0
const leafType = options.rawLeafNodes ? 'raw' : 'file'

pull(
file.content,
Expand All @@ -108,30 +109,56 @@ module.exports = function (createChunker, ipld, createReducer, _options) {
}
return Buffer.from(chunk)
}),
pull.map(buffer => new UnixFS(leafType, buffer)),
pull.asyncMap((fileNode, callback) => {
DAGNode.create(fileNode.marshal(), [], options.hashAlg, (err, node) => {
callback(err, { DAGNode: node, fileNode: fileNode })
pull.asyncMap((buffer, callback) => {
if (options.rawLeaves) {
return multihash(buffer, options.hashAlg, (error, hash) => {
if (error) {
return callback(error)
}

return callback(null, {
multihash: hash,
size: buffer.length,
leafSize: buffer.length,
cid: new CID(1, 'raw', hash),
data: buffer
})
})
}

const file = new UnixFS(options.leafType, buffer)

DAGNode.create(file.marshal(), [], options.hashAlg, (err, node) => {
if (err) {
return callback(err)
}

callback(null, {
multihash: node.multihash,
size: node.size,
leafSize: file.fileSize(),
cid: new CID(options.cidVersion, 'dag-pb', node.multihash),
data: node
})
})
}),
pull.asyncMap((leaf, callback) => {
if (options.onlyHash) return callback(null, leaf)

let cid = new CID(leaf.DAGNode.multihash)

if (options.cidVersion === 1) {
cid = cid.toV1()
if (options.onlyHash) {
return callback(null, leaf)
}

ipld.put(leaf.DAGNode, { cid }, (err) => callback(err, leaf))
ipld.put(leaf.data, {
cid: leaf.cid
}, (error) => callback(error, leaf))
}),
pull.map((leaf) => {
return {
path: file.path,
multihash: leaf.DAGNode.multihash,
size: leaf.DAGNode.size,
leafSize: leaf.fileNode.fileSize(),
name: ''
multihash: leaf.multihash,
size: leaf.size,
leafSize: leaf.leafSize,
name: '',
cid: leaf.cid
}
}),
through( // mark as single node if only one single node
Expand Down
47 changes: 31 additions & 16 deletions src/builder/reduce.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ module.exports = function (file, ipld, options) {
if (leaves.length === 1 && leaves[0].single && options.reduceSingleLeafToSelf) {
const leaf = leaves[0]

if (!options.rawLeafNodes) {
if (options.leafType === 'file' && !options.rawLeaves) {
return callback(null, {
path: file.path,
multihash: leaf.multihash,
Expand All @@ -23,19 +23,22 @@ module.exports = function (file, ipld, options) {
})
}

// we are using raw leaf nodes, this file only has one node but it'll be marked raw
// so convert it back to a file node
// we're using raw leaf nodes so we convert the node into a UnixFS `file` node.
return waterfall([
(cb) => ipld.get(new CID(leaf.multihash), cb),
(cb) => ipld.get(leaf.cid, cb),
(result, cb) => {
const meta = UnixFS.unmarshal(result.value.data)
const fileNode = new UnixFS('file', meta.data)
const data = result.value.data
const fileNode = new UnixFS('file', data)

DAGNode.create(fileNode.marshal(), [], options.hashAlg, (err, node) => {
cb(err, { DAGNode: node, fileNode: fileNode })
})
},
(result, cb) => {
if (options.onlyHash) {
return cb(null, result)
}

let cid = new CID(result.DAGNode.multihash)

if (options.cidVersion === 1) {
Expand All @@ -46,10 +49,11 @@ module.exports = function (file, ipld, options) {
},
(result, cb) => {
cb(null, {
path: file.path,
multihash: result.DAGNode.multihash,
size: result.DAGNode.size,
leafSize: result.fileNode.fileSize(),
name: ''
name: leaf.name
})
}
], callback)
Expand All @@ -61,23 +65,34 @@ module.exports = function (file, ipld, options) {
const links = leaves.map((leaf) => {
f.addBlockSize(leaf.leafSize)

return new DAGLink(leaf.name, leaf.size, leaf.multihash)
let cid = leaf.cid

if (!cid) {
// we are an intermediate node
cid = new CID(options.cidVersion, 'dag-pb', leaf.multihash)
}

return new DAGLink(leaf.name, leaf.size, cid.buffer)
})

waterfall([
(cb) => DAGNode.create(f.marshal(), links, options.hashAlg, cb),
(node, cb) => {
if (options.onlyHash) return cb(null, node)

let cid = new CID(node.multihash)
const cid = new CID(options.cidVersion, 'dag-pb', node.multihash)

if (options.cidVersion === 1) {
cid = cid.toV1()
if (options.onlyHash) {
return cb(null, {
node, cid
})
}

ipld.put(node, { cid }, (err) => cb(err, node))
ipld.put(node, {
cid
}, (err) => cb(err, {
node, cid
}))
}
], (err, node) => {
], (err, {node, cid}) => {
if (err) {
callback(err)
return // early
Expand All @@ -86,7 +101,7 @@ module.exports = function (file, ipld, options) {
const root = {
name: '',
path: file.path,
multihash: node.multihash,
multihash: cid.buffer,
size: node.size,
leafSize: f.fileSize()
}
Expand Down
10 changes: 10 additions & 0 deletions src/exporter/file.js
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@ function streamBytes (dag, node, fileSize, offset, length) {

function getData ({ node, start }) {
try {
if (Buffer.isBuffer(node)) {
// this is a raw node
return extractDataFromBlock(node, start, offset, end)
}

const file = UnixFS.unmarshal(node.data)

if (!file.data) {
Expand All @@ -80,6 +85,11 @@ function streamBytes (dag, node, fileSize, offset, length) {
let streamPosition = 0

function visitor ({ node }) {
if (Buffer.isBuffer(node)) {
// this is a raw node
return pull.empty()
}

const file = UnixFS.unmarshal(node.data)
const nodeHasData = Boolean(file.data && file.data.length)

Expand Down
20 changes: 19 additions & 1 deletion src/importer/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,29 @@ const chunkers = {

const defaultOptions = {
chunker: 'fixed',
rawLeafNodes: false
rawLeaves: false,
hashOnly: false,
cidVersion: 0,
hash: null,
leafType: 'file',
hashAlg: 'sha2-256'
}

module.exports = function (ipld, _options) {
const options = Object.assign({}, defaultOptions, _options)

if (options.cidVersion > 0 && _options.rawLeaves === undefined) {
// if the cid version is 1 or above, use raw leaves as this is
// what go does.
options.rawLeaves = true
}

if (_options && _options.hash !== undefined && _options.rawLeaves === undefined) {
// if a non-default hash alg has been specified, use raw leaves as this is
// what go does.
options.rawLeaves = true
}

const Chunker = chunkers[options.chunker]
assert(Chunker, 'Unknkown chunker named ' + options.chunker)

Expand Down
31 changes: 31 additions & 0 deletions test/exporter.js
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,37 @@ module.exports = (repo) => {
)
})

it('exports a large file > 5mb imported with raw leaves', function (done) {
  // round-tripping the big fixture takes a while, so raise the test timeout
  this.timeout(30 * 1000)

  // Final step: check what the exporter produced against the fixture
  const verifyExport = (exportError, exported) => {
    expect(exportError).to.not.exist()

    const encodedHash = bs58.encode(exported[0].hash)
    expect(encodedHash).to.equal('QmQLTvhjmSa7657mKdSfTjxFBdwxmK8n9tZC9Xdp9DtxWY')

    // `done` is handed to fileEql, which finishes the test
    fileEql(exported[0], bigFile, done)
  }

  // Middle step: the import must yield exactly one entry, which is then exported again
  const onImported = (importError, imported) => {
    expect(importError).to.not.exist()
    expect(imported.length).to.equal(1)

    pull(
      exporter(imported[0].multihash, ipld),
      pull.collect(verifyExport)
    )
  }

  const entries = [{
    path: '200Bytes.txt',
    content: pull.values([bigFile])
  }]

  // First step: import the fixture with raw (non-protobuf-wrapped) leaves
  pull(
    pull.values(entries),
    importer(ipld, { rawLeaves: true }),
    pull.collect(onImported)
  )
})

it('returns an empty stream for dir', (done) => {
const hash = 'QmUNLLsPACCz1vLxQVkXqqLX5R1X345qqfHbsf67hvA3Nn'

Expand Down
32 changes: 32 additions & 0 deletions test/helpers/collect-leaf-cids.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
'use strict'

const pull = require('pull-stream')
const traverse = require('pull-traverse')
const CID = require('cids')

module.exports = (ipld, multihash, callback) => {
pull(
traverse.depthFirst(new CID(multihash), (cid) => {
return pull(
pull.values([cid]),
pull.asyncMap((cid, callback) => {
ipld.get(cid, (error, result) => {
callback(error, !error && result.value)
})
}),
pull.asyncMap((node, callback) => {
if (!node.links) {
return callback()
}

return callback(
null, node.links.map(link => new CID(link.multihash))
)
}),
pull.filter(Boolean),
pull.flatten()
)
}),
pull.collect(callback)
)
}
Loading

0 comments on commit 9105db1

Please sign in to comment.