Skip to content

Commit

Permalink
[v3.0.2] Fix hyperloglog + seedrandom typings (#73)
Browse files Browse the repository at this point in the history
* fix seedrandom typing after update

* fix seedrandom typing after update

* fix hyperloglog

* prettier

* use next/* branches

* use original seedrandom.PRNG type
  • Loading branch information
folkvir authored Jun 11, 2024
1 parent ae96027 commit e841a7f
Show file tree
Hide file tree
Showing 8 changed files with 596 additions and 473 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/npm_test_doc.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
name: 🔎 Tests
on:
push:
branches: [ master, develop ]
branches: [ master, develop, next/* ]
pull_request:
branches: [ master, develop ]
branches: [ master, develop, next/* ]
jobs:
ubuntu_build:
runs-on: ubuntu-latest
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "bloom-filters",
"version": "3.0.1",
"version": "3.0.2",
"description": "JS implementation of probabilistic data structures: Bloom Filter (and its derived), HyperLogLog, Count-Min Sketch, Top-K and MinHash",
"main": "dist/api.js",
"type": "commonjs",
Expand Down Expand Up @@ -52,7 +52,7 @@
"@types/lodash.eq": "^4.0.X",
"@types/lodash.indexof": "^4.0.X",
"@types/node": "^17.0.17",
"@types/seedrandom": "^3.0.2",
"@types/seedrandom": "^3.0.8",
"@types/xxhashjs": "^0.2.X",
"@typescript-eslint/eslint-plugin": "^5.11.0",
"@typescript-eslint/parser": "^5.11.0",
Expand Down
22 changes: 5 additions & 17 deletions src/base-filter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,35 +22,23 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

import seedrandom from 'seedrandom'
import seedrandom, {PRNG} from 'seedrandom'
import Hashing from './hashing/hashing'
import {getDefaultSeed} from './utils'

/**
* Exported prng type because it is not from seedrandom
* Original type can be found in: @types/seedrandom
*/
export interface prng {
(): number
double(): number
int32(): number
quick(): number
state(): seedrandom.State
}

/**
* A base class for implementing probabilistic filters
* @author Thomas Minier
* @author Arnaud Grall
*/
export default abstract class BaseFilter {
public _seed: number
public _rng: prng
public _rng: PRNG
public _hashing: Hashing

constructor() {
this._seed = getDefaultSeed()
this._rng = seedrandom(`${this._seed}`) as prng
this._rng = seedrandom(`${this._seed}`)
this._hashing = new Hashing()
}

Expand All @@ -67,14 +55,14 @@ export default abstract class BaseFilter {
*/
public set seed(seed: number) {
this._seed = seed
this._rng = seedrandom(`${this._seed}`) as prng
this._rng = seedrandom(`${this._seed}`)
}

/**
* Get a function used to draw random number
* @return A factory function used to draw random integer
*/
public get random(): prng {
public get random() {
return this._rng
}

Expand Down
4 changes: 2 additions & 2 deletions src/bloom/scalable-bloom-filter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ SOFTWARE.
*/

import ClassicFilter from '../interfaces/classic-filter'
import BaseFilter, {prng} from '../base-filter'
import BaseFilter from '../base-filter'
import {AutoExportable, Field, Parameter} from '../exportable'
import {HashableInput} from '../utils'
import PartitionBloomFilter from './partitioned-bloom-filter'
Expand Down Expand Up @@ -118,7 +118,7 @@ export default class ScalableBloomFilter
*/
public set seed(seed: number) {
this._seed = seed
this._rng = seedrandom(`${this._seed}`) as prng
this._rng = seedrandom(`${this._seed}`)
this._filters.forEach((filter: PartitionBloomFilter) => {
filter.seed = this.seed
})
Expand Down
101 changes: 61 additions & 40 deletions src/sketch/hyperloglog.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

import XXH from 'xxhashjs'
import BaseFilter from '../base-filter'
import {AutoExportable, Field, Parameter} from '../exportable'
import {HashableInput, allocateArray} from '../utils'

// 2^32, computed as a constant as we use it a lot in the HyperLogLog algorithm
const TWO_POW_32 = Math.pow(2, 32)
const TWO_POW_32 = 2 ** 32

/**
* Estimate the bias-correction constant, denoted alpha in the algorithm, based on the number of registers.
Expand All @@ -36,15 +37,17 @@ const TWO_POW_32 = Math.pow(2, 32)
* @return The estimated bias-correction constant
*/
function computeAlpha(m: number): number {
switch (m) {
case 16:
return 0.673
case 32:
return 0.697
case 64:
return 0.709
default:
return 0.7213 / (1.0 + 1.079 / m)
if (m < 16) {
return 1
} else if (m < 32) {
return 0.673
} else if (m < 64) {
return 0.697
} else if (m < 128) {
return 0.709
} else {
// >= 128
return 0.7213 / (1.0 + 1.079 / m)
}
}

Expand Down Expand Up @@ -85,8 +88,11 @@ export default class HyperLogLog extends BaseFilter {
*/
constructor(@Parameter('_nbRegisters') nbRegisters: number) {
super()
if ((nbRegisters & (nbRegisters - 1)) !== 0) {
throw new Error('The number of registers should be a power of 2')
}
this._nbRegisters = nbRegisters
this._nbBytesPerHash = Math.round(Math.log2(nbRegisters))
this._nbBytesPerHash = Math.ceil(Math.log2(nbRegisters))
this._correctionBias = computeAlpha(nbRegisters)
this._registers = allocateArray(this._nbRegisters, 0)
}
Expand All @@ -103,23 +109,28 @@ export default class HyperLogLog extends BaseFilter {
* @param element - Element to add
*/
public update(element: HashableInput): void {
// const hashedValue = Buffer.from(hashAsString(element, this.seed))
const hashedValue = this._hashing.hashAsInt(element, this.seed).toString(2)
const registerIndex =
1 + parseInt(hashedValue.slice(0, this._nbBytesPerHash - 1), 2)
const hashedValue = XXH.h64(element, this.seed)
.toString(2)
.padStart(64, '0')
const k = 64 - this._nbBytesPerHash
const registerIndex = parseInt(hashedValue.slice(k), 2)
// find the left most 1-bit in the second part of the buffer
const secondPart = hashedValue.slice(this._nbBytesPerHash)
let posLeftMost = 0
while (
secondPart[posLeftMost] !== '1' &&
posLeftMost < secondPart.length - 1
) {
posLeftMost++
const second = hashedValue.slice(0, k)
let leftmost_pos = k - 1
let found = false
let i = 0
while (!found && i < second.length) {
if (second[i] === '1') {
found = true
leftmost_pos = i
} else {
i++
}
}
// update the register
this._registers[registerIndex] = Math.max(
this._registers[registerIndex],
posLeftMost
leftmost_pos
)
}

Expand All @@ -129,28 +140,38 @@ export default class HyperLogLog extends BaseFilter {
*/
public count(round = false): number {
// Use the standard HyperLogLog estimator
const harmonicMean = this._registers.reduce(
const Z = this._registers.reduce(
(acc: number, value: number) => acc + Math.pow(2, -value),
0
)
let estimation =
(this._correctionBias * Math.pow(this._nbRegisters, 2)) / harmonicMean

// use linear counting to correct the estimation if E < 5m/2 and some registers are set to zero
/*if (estimation < ((5/2) * this._nbRegisters) && this._registers.some(value => value === 0)) {
const nbZeroRegisters = this._registers.filter(value => value === 0).length
estimation = this._nbRegisters * Math.log(this._nbRegisters / nbZeroRegisters)
}*/

// correct the estimation for very large registers
if (estimation > TWO_POW_32 / 30) {
estimation = -TWO_POW_32 * Math.log(1 - estimation / TWO_POW_32)
const raw_estimation =
(this._correctionBias * this._nbRegisters * this._nbRegisters * 2) / Z

let corrected_estimation

if (raw_estimation <= (5 / 2) * this._nbRegisters) {
// use linear counting to correct the estimation if E < 5m/2 and some registers are set to zero
const V = this._registers.filter(value => value === 0).length
if (V > 0) {
// small range correction: linear counting
corrected_estimation =
this._nbRegisters * Math.log(this._nbRegisters / V)
} else {
corrected_estimation = raw_estimation
}
} else if (raw_estimation <= TWO_POW_32 / 30) {
// middle range correction; no correction
corrected_estimation = raw_estimation
} else {
// raw_estimation > TWO_POW_32 / 30
// large range correction
corrected_estimation =
-TWO_POW_32 * Math.log(1 - raw_estimation / TWO_POW_32)
}
// round if required
if (round) {
estimation = Math.round(estimation)
return Math.round(corrected_estimation)
}
return estimation
return corrected_estimation
}

/**
Expand All @@ -173,7 +194,7 @@ export default class HyperLogLog extends BaseFilter {
)
}
const newSketch = new HyperLogLog(this.nbRegisters)
for (let i = 0; i < this.nbRegisters - 1; i++) {
for (let i = 0; i < this.nbRegisters; i++) {
newSketch._registers[i] = Math.max(
this._registers[i],
other._registers[i]
Expand Down
68 changes: 49 additions & 19 deletions test/hyperloglog-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,52 @@ const {HyperLogLog} = require('../dist/api.js')
describe('HyperLogLog', () => {
describe('#update', () => {
it('should support update and cardinality estimations (count) operations', () => {
const nbDistinct = 100
const sketch = new HyperLogLog(110)
const m = 2 ** 8
const n = 10e4
const sketch = new HyperLogLog(m)
// populate the sketch with some values
for (let i = 0; i < 10e3; i++) {
sketch.update(`${i % nbDistinct}`)
for (let i = 0; i < n; i++) {
sketch.update(i.toString())
}
sketch
.count(true)
.should.be.closeTo(nbDistinct, nbDistinct * sketch.accuracy())
})

// Citation:
// "Let σ ≈ 1.04/√m represent the standard error; the estimates provided by HYPERLOGLOG
// are expected to be within σ, 2σ, 3σ of the exact count in respectively 65%, 95%, 99% of all
// the cases"
const exact_count = sketch.count()
const relative_error = sketch.accuracy()

let error
const relative_errors = [
relative_error,
2 * relative_error,
3 * relative_error,
]

for (const relative_err of relative_errors) {
try {
Math.abs(n - exact_count).should.be.below(n * relative_err)
error = false
break
} catch (e) {
error = e
}
}

if (error) {
throw new Error(
`should be withing σ, 2σ or 3σ: ${relative_errors
.map(e => e * n)
.toString()}: ${error.toString()}`
)
}
}).timeout(0)
})

describe('#merge', () => {
it('should peforms the union of two HyperLogLog sketches', () => {
const first = new HyperLogLog(10)
const second = new HyperLogLog(10)
const first = new HyperLogLog(8)
const second = new HyperLogLog(8)
first.update('alice')
first.update('bob')
second.update('carol')
Expand All @@ -59,8 +89,8 @@ describe('HyperLogLog', () => {
})

it('should reject the union of two sketches with different number of registers', done => {
const first = new HyperLogLog(10)
const second = new HyperLogLog(20)
const first = new HyperLogLog(8)
const second = new HyperLogLog(32)
try {
first.merge(second)
done(
Expand All @@ -76,8 +106,8 @@ describe('HyperLogLog', () => {

describe('#equals', () => {
it('should returns True when two HyperLogLog sketches are equals', () => {
const first = new HyperLogLog(10)
const second = new HyperLogLog(10)
const first = new HyperLogLog(8)
const second = new HyperLogLog(8)
first.update('alice')
first.update('bob')
second.update('alice')
Expand All @@ -86,14 +116,14 @@ describe('HyperLogLog', () => {
})

it('should returns False when two sketches have different number of registers', () => {
const first = new HyperLogLog(10)
const second = new HyperLogLog(11)
const first = new HyperLogLog(8)
const second = new HyperLogLog(16)
first.equals(second).should.equal(false)
})

it('should returns False when two sketches have different content in their registers', () => {
const first = new HyperLogLog(10)
const second = new HyperLogLog(11)
const first = new HyperLogLog(8)
const second = new HyperLogLog(16)
first.update('alice')
first.update('bob')
second.update('carol')
Expand All @@ -103,7 +133,7 @@ describe('HyperLogLog', () => {
})

describe('#saveAsJSON', () => {
const sketch = new HyperLogLog(10)
const sketch = new HyperLogLog(8)
sketch.update('alice')
sketch.update('bob')

Expand Down
4 changes: 2 additions & 2 deletions test/iblt-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ describe('Invertible Bloom Lookup Tables', () => {
invalids.forEach(json => {
;(function () {
InvertibleBloomFilter.fromJSON(json)
}.should.throw(Error))
}).should.throw(Error)
})
})

Expand All @@ -187,7 +187,7 @@ describe('Invertible Bloom Lookup Tables', () => {
_elements: [],
_seed: 1,
})
}.should.not.throw(Error))
}).should.not.throw(Error)
})
})

Expand Down
Loading

0 comments on commit e841a7f

Please sign in to comment.