Add diffusion map dimensionality reduction and simple Principal Component Analysis #17

Merged Jul 13, 2020 (24 commits)
Changes from all commits
32602b5
Add first draft of spectral clustering algorithm
bytesnake Mar 5, 2020
bc4913d
Add LOBPCG eigensolver
bytesnake Mar 10, 2020
7087362
Add more code to LOBPCG
bytesnake Mar 11, 2020
2da63b4
Complete port of LOBPCG
bytesnake Mar 13, 2020
df713bd
Start debugging of LOBPCG
bytesnake Mar 13, 2020
e4fede3
Add tests for LOBPCG
bytesnake Mar 13, 2020
c735724
Add linfa-reduction library for dimensionality reduction
bytesnake Mar 18, 2020
a675424
Repair example for diffusion map dimensionality reduction
bytesnake Mar 18, 2020
3c977ea
Add simple principal component analysis
bytesnake Mar 19, 2020
58e4d81
Remove hyperparameter file from PCA
bytesnake Mar 19, 2020
3b12413
Add kernel traits to unify dense and sparse matrices
bytesnake Mar 22, 2020
fe56fec
Only symmetric kernel functions are allowed
bytesnake Mar 23, 2020
1765834
Use kernel trait in diffusion maps
bytesnake Mar 23, 2020
fda3f6d
Apply D on rows, not columns
bytesnake Mar 23, 2020
aff9fa2
Use full eigenvalue decomposition for small problems
bytesnake Mar 24, 2020
917ce64
Add simple dense gaussian kernel
bytesnake Mar 24, 2020
15e5fe2
Update example
bytesnake Apr 1, 2020
7d48a78
Add dot kernel
bytesnake Apr 2, 2020
a86b70d
Add sparse gaussian kernel with `hnsw`
bytesnake Apr 3, 2020
7235a95
Fix sparse gaussian kernel
bytesnake Apr 4, 2020
133b612
Ensure that sparse kernel has elements for each point
bytesnake Apr 7, 2020
e384ba3
Simplify sparse kernels and add polynomial kernel
bytesnake Apr 8, 2020
9414bfb
Remove unnecessary debugging output
bytesnake Apr 8, 2020
7c29902
Comment on Principal Component Analysis
bytesnake Jul 13, 2020
5 changes: 4 additions & 1 deletion Cargo.toml
@@ -22,7 +22,10 @@ rand_isaac = "0.2.0"
ndarray-npy = { version = "0.5", default-features = false }

[workspace]
members = ["linfa-clustering"]
members = [
"linfa-clustering",
"linfa-reduction"
]

[profile.release]
opt-level = 3
3 changes: 3 additions & 0 deletions linfa-clustering/Cargo.toml
@@ -16,7 +16,10 @@ categories = ["algorithms", "mathematics", "science"]
ndarray = { version = "0.13" , features = ["rayon", "serde", "approx"]}
ndarray-rand = "0.11"
ndarray-stats = "0.3"
ndarray-linalg = { version = "0.12", features = ["openblas"] }
sprs = "0.7"
serde = { version = "1", features = ["derive"] }
num-traits = "0.1.32"

[dev-dependencies]
rand_isaac = "0.2.0"
1 change: 1 addition & 0 deletions linfa-clustering/examples/kmeans.rs
@@ -37,3 +37,4 @@ fn main() {
)
.expect("Failed to write .npy file");
}

4 changes: 4 additions & 0 deletions linfa-clustering/src/lib.rs
@@ -18,6 +18,10 @@
//! Implementation choices, algorithmic details and a tutorial can be found [here](struct.KMeans.html).
//!
//! Check [here](https://github.com/LukeMathWalker/clustering-benchmarks) for extensive benchmarks against `scikit-learn`'s K-means implementation.

extern crate ndarray;
extern crate ndarray_linalg;

mod dbscan;
#[allow(clippy::new_ret_no_self)]
mod k_means;
32 changes: 32 additions & 0 deletions linfa-reduction/Cargo.toml
@@ -0,0 +1,32 @@
[package]
name = "linfa-reduction"
version = "0.1.0"
authors = ["Lorenz Schmidt <bytesnake@mailbox.org>"]
description = "A collection of dimensionality reduction techniques"
edition = "2018"
license = "MIT/Apache-2.0"

repository = "https://github.com/LukeMathWalker/linfa"
readme = "README.md"

keywords = ["dimensionality reduction", "machine-learning", "linfa", "spectral", "unsupervised"]
categories = ["algorithms", "mathematics", "science"]

[dependencies]
ndarray = { version = "0.13" , features = ["rayon", "serde", "approx"]}
ndarray-rand = "0.11"
ndarray-stats = "0.3"
ndarray-linalg = { version = "0.12", features = ["openblas"] }
sprs = "0.7"
serde = { version = "1", features = ["derive"] }
num-traits = "0.1.32"
hnsw = "0.6"
space = "0.10"

[dev-dependencies]
rand_isaac = "0.2.0"
ndarray-npy = { version = "0.5", default-features = false }
criterion = "0.3"
serde_json = "1"
approx = "0.3"
linfa-clustering = { path = "../linfa-clustering" }
34 changes: 34 additions & 0 deletions linfa-reduction/examples/diffusion_map.rs
@@ -0,0 +1,34 @@
use linfa_reduction::utils::generate_convoluted_rings;
use linfa_reduction::kernel::{IntoKernel, SparsePolynomialKernel};
use ndarray_npy::write_npy;
use ndarray_rand::rand::SeedableRng;
use rand_isaac::Isaac64Rng;

// A routine diffusion map task: build a synthetic dataset, compute its
// spectral embedding and save both training data and embedding to disk.
fn main() {
// Our random number generator, seeded for reproducibility
let mut rng = Isaac64Rng::seed_from_u64(42);

// Number of data points to generate per ring
let n = 3000;

// generate three convoluted rings
let dataset = generate_convoluted_rings(&[(0.0, 3.0), (10.0, 13.0), (20.0, 23.0)], n, &mut rng);

// generate a sparse polynomial kernel with k = 14, c = 5 and d = 2,
// then reduce to a four-dimensional embedding
let diffusion_map = SparsePolynomialKernel::new(&dataset, 14, 5.0, 2.0)
.reduce_fixed(4);

// get embedding
let embedding = diffusion_map.embedding();

// Save the dataset and its embedding to disk,
// using the `npy` format for compatibility with NumPy
write_npy("diffusion_map_dataset.npy", dataset).expect("Failed to write .npy file");
write_npy(
"diffusion_map_embedding.npy",
embedding
)
.expect("Failed to write .npy file");
}
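The `SparsePolynomialKernel` source is not part of this section; going by its name and the `(k, c, d)` arguments, its entries presumably follow the usual polynomial kernel (an assumption, not read from this diff),

$$k(a, b) = (a^\top b + c)^d$$

restricted to the `k = 14` nearest neighbours of each point, with `reduce_fixed(4)` then projecting onto a four-dimensional diffusion embedding.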
34 changes: 34 additions & 0 deletions linfa-reduction/examples/pca.rs
@@ -0,0 +1,34 @@
use linfa_clustering::generate_blobs;
use linfa_reduction::PrincipalComponentAnalysis;
use ndarray::array;
use ndarray_npy::write_npy;
use ndarray_rand::rand::SeedableRng;
use rand_isaac::Isaac64Rng;

// A routine PCA task: build a synthetic dataset, fit the algorithm on it
// and save both training data and the reduced embedding to disk.
fn main() {
// Our random number generator, seeded for reproducibility
let mut rng = Isaac64Rng::seed_from_u64(42);

// For each of our expected centroids, generate `n` data points around it (a "blob")
let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],];
let n = 10;
let dataset = generate_blobs(n, &expected_centroids, &mut rng);

dbg!(&dataset);

let embedding = PrincipalComponentAnalysis::fit(dataset.clone(), 1)
.predict(&dataset);

dbg!(&embedding);

// Save the dataset and its embedding to disk,
// using the `npy` format for compatibility with NumPy
write_npy("pca_dataset.npy", dataset).expect("Failed to write .npy file");
write_npy(
"pca_embedding.npy",
embedding
)
.expect("Failed to write .npy file");
}
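The `PrincipalComponentAnalysis` implementation itself is not shown in this section, but the example above matches the usual PCA construction (stated here for reference, not read from the diff): center the data and project onto the leading eigenvectors of the sample covariance,

$$\bar{X} = X - \mathbf{1}\mu^\top, \qquad C = \tfrac{1}{n-1}\,\bar{X}^\top \bar{X}, \qquad Y = \bar{X} W_k$$

where $W_k$ holds the $k$ leading eigenvectors of $C$; `fit(dataset, 1)` would then correspond to $k = 1$.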
113 changes: 113 additions & 0 deletions linfa-reduction/src/diffusion_map/algorithms.rs
@@ -0,0 +1,113 @@
use ndarray::{Array1, Array2};
use ndarray_linalg::{TruncatedOrder, lobpcg::LobpcgResult, lobpcg, eigh::EighInto, lapack::UPLO, Scalar};
use ndarray_rand::{rand_distr::Uniform, RandomExt};
use num_traits::NumCast;

use crate::Float;
use crate::kernel::{Kernel, IntoKernel};
use super::hyperparameters::DiffusionMapHyperParams;

pub struct DiffusionMap<A> {
hyperparameters: DiffusionMapHyperParams,
embedding: Array2<A>,
eigvals: Array1<A>
}

impl<A: Float> DiffusionMap<A> {
pub fn project(
hyperparameters: DiffusionMapHyperParams,
kernel: impl IntoKernel<A>
) -> Self {
// compute spectral embedding with diffusion map
let (embedding, eigvals) = compute_diffusion_map(kernel.into_kernel(), hyperparameters.steps(), 0.0, hyperparameters.embedding_size(), None);

DiffusionMap {
hyperparameters,
embedding,
eigvals
}
}

/// Return the hyperparameters used to compute this diffusion map instance.
pub fn hyperparameters(&self) -> &DiffusionMapHyperParams {
&self.hyperparameters
}

/// Estimate the number of clusters in this embedding (very crude for now)
pub fn estimate_clusters(&self) -> usize {
let mean = self.eigvals.sum() / NumCast::from(self.eigvals.len()).unwrap();
self.eigvals.iter().filter(|x| *x > &mean).count() + 1
}

/// Return a copy of the eigenvalues
pub fn eigvals(&self) -> Array1<A> {
self.eigvals.clone()
}

pub fn embedding(&self) -> Array2<A> {
self.embedding.clone()
}
}

fn compute_diffusion_map<A: Float>(kernel: impl Kernel<A>, steps: usize, alpha: f32, embedding_size: usize, guess: Option<Array2<A>>) -> (Array2<A>, Array1<A>) {
assert!(embedding_size < kernel.size());

let d = kernel.sum()
.mapv(|x| x.recip());

let d2 = d.mapv(|x| x.powf(NumCast::from(0.5 + alpha).unwrap()));

// use full eigenvalue decomposition for small problem sizes
let (vals, mut vecs) = if kernel.size() < 5 * embedding_size + 1 {
let mut matrix = kernel.mul_similarity(&Array2::from_diag(&d).view());
matrix.gencolumns_mut().into_iter().zip(d.iter()).for_each(|(mut a,b)| a *= *b);

let (vals, vecs) = matrix.eigh_into(UPLO::Lower).unwrap();
let (vals, vecs) = (vals.slice_move(s![..; -1]), vecs.slice_move(s![.., ..; -1]));
(
vals.slice_move(s![1..embedding_size+1]).mapv(|x| Scalar::from_real(x)),
vecs.slice_move(s![..,1..embedding_size+1])
)
} else {
// calculate truncated eigenvalue decomposition
let x = guess.unwrap_or(
Array2::random((kernel.size(), embedding_size + 1), Uniform::new(0.0f64, 1.0))
.mapv(|x| NumCast::from(x).unwrap())
);

let result = lobpcg::lobpcg(|y| {
let mut y = y.to_owned();
y.genrows_mut().into_iter().zip(d2.iter()).for_each(|(mut a,b)| a *= *b);
let mut y = kernel.mul_similarity(&y.view());
y.genrows_mut().into_iter().zip(d2.iter()).for_each(|(mut a,b)| a *= *b);

y
}, x, |_| {}, None, 1e-15, 200, TruncatedOrder::Largest);

let (vals, vecs) = match result {
LobpcgResult::Ok(vals, vecs, _) | LobpcgResult::Err(vals, vecs, _, _) => { (vals, vecs)},
_ => panic!("Eigendecomposition failed!")
};


// cut away first eigenvalue/eigenvector
(
vals.slice_move(s![1..]),
vecs.slice_move(s![..,1..])
)
};

let d = d.mapv(|x| x.sqrt());

for (mut col, val) in vecs.genrows_mut().into_iter().zip(d.iter()) {
col *= *val;
}

let steps = NumCast::from(steps).unwrap();
for (mut vec, val) in vecs.gencolumns_mut().into_iter().zip(vals.iter()) {
vec *= val.powf(steps);
}

(vecs, vals)
}

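Reading `compute_diffusion_map` above (with `alpha = 0`, as `project` calls it, and as implemented in the LOBPCG branch), the computation amounts to the standard diffusion map construction; the notation below is mine, not from the PR:

$$D = \mathrm{diag}\Big(\sum_j K_{ij}\Big), \qquad M = D^{-1/2} K D^{-1/2}, \qquad M \psi_i = \lambda_i \psi_i$$

The embedding columns are then $\varphi_i = \lambda_i^{\,t}\, D^{-1/2} \psi_i$ for the `embedding_size` leading eigenpairs after discarding the trivial one, with $t$ the number of diffusion `steps`; a full `eigh` is used for small problems and LOBPCG otherwise.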
48 changes: 48 additions & 0 deletions linfa-reduction/src/diffusion_map/hyperparameters.rs
@@ -0,0 +1,48 @@
pub struct DiffusionMapHyperParams {
steps: usize,
embedding_size: usize,
}

pub struct DiffusionMapHyperParamsBuilder {
steps: usize,
embedding_size: usize,
}

impl DiffusionMapHyperParamsBuilder {
pub fn steps(mut self, steps: usize) -> Self {
self.steps = steps;

self
}

pub fn build(self) -> DiffusionMapHyperParams {
DiffusionMapHyperParams::build(
self.steps,
self.embedding_size,
)
}
}

impl DiffusionMapHyperParams {
pub fn new(embedding_size: usize) -> DiffusionMapHyperParamsBuilder {
DiffusionMapHyperParamsBuilder {
steps: 10,
embedding_size
}
}

pub fn steps(&self) -> usize {
self.steps
}

pub fn embedding_size(&self) -> usize {
self.embedding_size
}

pub fn build(steps: usize, embedding_size: usize) -> Self {
DiffusionMapHyperParams {
steps,
embedding_size,
}
}
}
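A minimal usage sketch of this builder together with the `DiffusionMap` API from `algorithms.rs` above. The crate-root re-exports and the `kernel` module path are assumptions inferred from `mod.rs` and the examples, not confirmed by this diff:

```rust
use ndarray::Array2;
use linfa_reduction::kernel::GaussianKernel;
use linfa_reduction::{DiffusionMap, DiffusionMapHyperParams};

// Hypothetical helper: embed a dense dataset into two dimensions.
fn embed(dataset: &Array2<f64>) -> Array2<f64> {
    // `new(2)` fixes the embedding size; `steps` defaults to 10
    let params = DiffusionMapHyperParams::new(2).steps(10).build();
    // any `IntoKernel` works here; a dense Gaussian kernel with eps = 2.0
    DiffusionMap::project(params, GaussianKernel::new(dataset, 2.0)).embedding()
}
```

Note that `compute_diffusion_map` asserts `embedding_size < kernel.size()`, so the dataset must contain more observations than the requested embedding size.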
5 changes: 5 additions & 0 deletions linfa-reduction/src/diffusion_map/mod.rs
@@ -0,0 +1,5 @@
mod algorithms;
mod hyperparameters;

pub use algorithms::*;
pub use hyperparameters::*;
59 changes: 59 additions & 0 deletions linfa-reduction/src/kernel/gaussian.rs
@@ -0,0 +1,59 @@
use num_traits::NumCast;
use ndarray::{Array2, ArrayView1};
use sprs::CsMat;
use std::iter::Sum;

use crate::Float;
use crate::kernel::{IntoKernel, dense_from_fn, sparse_from_fn};

fn kernel<A: Float>(a: ArrayView1<A>, b: ArrayView1<A>, eps: A) -> A {
let distance = a.iter().zip(b.iter()).map(|(x,y)| (*x-*y)*(*x-*y))
.sum::<A>();

(-distance / eps).exp()
}

pub struct GaussianKernel<A> {
pub data: Array2<A>
}

impl<A: Float + Sum<A>> GaussianKernel<A> {
pub fn new(dataset: &Array2<A>, eps: f32) -> Self {
let eps = NumCast::from(eps).unwrap();

GaussianKernel {
data: dense_from_fn(dataset, |a,b| kernel(a,b,eps))
}
}
}

impl<A: Float> IntoKernel<A> for GaussianKernel<A> {
type IntoKer = Array2<A>;

fn into_kernel(self) -> Self::IntoKer {
self.data
}
}

pub struct SparseGaussianKernel<A> {
similarity: CsMat<A>
}

impl<A: Float> SparseGaussianKernel<A> {
pub fn new(dataset: &Array2<A>, k: usize, eps: f32) -> Self {
let eps = NumCast::from(eps).unwrap();

SparseGaussianKernel {
similarity: sparse_from_fn(dataset, k, |a,b| kernel(a,b,eps))
}
}
}

impl<A: Float> IntoKernel<A> for SparseGaussianKernel<A> {
type IntoKer = CsMat<A>;

fn into_kernel(self) -> Self::IntoKer {
self.similarity
}
}

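Side by side, the two constructors from this file (a sketch; the import path mirrors the examples above, and the `eps` and `k` values are arbitrary):

```rust
use ndarray::Array2;
use linfa_reduction::kernel::{GaussianKernel, IntoKernel, SparseGaussianKernel};

fn build_kernels(dataset: &Array2<f64>) {
    // dense: evaluates exp(-||a - b||^2 / eps) for every pair of observations
    let dense = GaussianKernel::new(dataset, 2.0).into_kernel();
    // sparse: entries only for the k = 15 nearest neighbours, found via `hnsw`
    let sparse = SparseGaussianKernel::new(dataset, 15, 2.0).into_kernel();
    let _ = (dense, sparse);
}
```

The dense variant yields an `Array2`, the sparse one a `sprs::CsMat`; both feed into `DiffusionMap::project` through the `IntoKernel` trait.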