large update

wanghui5801 · Nov 21, 2024 · 1fb480f · 1fb480f
1 parent f446a6a
commit 1fb480f
Show file tree

Hide file tree

Showing 13 changed files with 1,110 additions and 113 deletions.
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2023 Pang
+Copyright (c) 2024 Hui Wang
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,4 @@
+include LICENSE
+include README.md
+include requirements-dev.txt
+recursive-include tests *.py
diff --git a/README.md b/README.md
@@ -1,54 +1,136 @@
 # Unsupervised Merge
 
-A simple Python package to cluster one-dimention series, following my working paper.
+A simple Python package for clustering one-dimensional data, implementing both traditional and novel clustering algorithms.
 
 ## Installation
 
-install the package using pip:
+Install the package using pip:
 
 ```
 pip install usmerge
 ```
 
-## SOM-K Cluster
+## Features
 
-![manmaid](som-k.png)
+This package provides multiple one-dimensional clustering methods:
 
-The steps are over here.
+- Equal Width Binning (equal_wid_merge)
+- Equal Frequency Binning (equal_fre_merge)
+- K-means Clustering (kmeans_merge)
+- SOM-K Clustering (som_k_merge)
+- Fuzzy C-Means (fcm_merge)
+- Kernel Density Based (kernel_density_merge)
+- Information Theoretic (information_merge)
+- Gaussian Mixture (gaussian_mixture_merge)
+- Hierarchical Density (hierarchical_density_merge)
 
-- Implement the SOM algorithm. Enter the data to be clustered into the SOM network and train. Because only moderately accurate clustering results are needed, training time can be greatly reduced. Algorithm convergence is not necessary.
+## Usage
 
-- After the training has concluded, the self-organization net- work makes each node in the output layer a nerve cell sen- sitive to a particular pattern. The inward star-like (Hu et al., 2002) weighting vector corresponding to each node becomes the center vector of each input pattern.
+### Data Format
+The package accepts various input formats:
+- pandas Series/DataFrame
+- numpy array
+- Python list/tuple
+- Any iterable of numbers
 
-- Use the inward star-like weighting vector obtained in (2) as the initial clustering center and implement the K-means clustering algorithm.
+### Basic Usage Examples
 
-- Obtain the SOM-K derived clusters and conduct relevant analysis.
-
-# Usage
-Note that you should import data with "dataframe" format, and then get a series from the dataframe, it could be the right format of series.
+1. Equal Width Binning:
+```python
+from usmerge import equal_wid_merge
+labels, edges = equal_wid_merge(data, n=3)
+```
 
-For example
+2. Equal Frequency Binning:
+```python
+from usmerge import equal_fre_merge
+labels, edges = equal_fre_merge(data, n=3)
 ```
-import pandas as pd
-ex = pd.read_excel("10.xlsx",converters={'证券代码':str})
-data = ex['显性']
+
+3. K-means Clustering:
+```python
+from usmerge import kmeans_merge
+labels, edges = kmeans_merge(data, n=3, max_iter=100)
 ```
 
-If you want to use som-k cluster.
+### Advanced Usage
 
-```
+1. SOM-K Clustering:
+```python
 from usmerge import som_k_merge
-result = som_k_merge(data,3,sig=0.5,lr=0.5,echo=1000)
+labels, edges = som_k_merge(data, n=3, sigma=0.5, learning_rate=0.5, epochs=1000)
 ```
 
-Of course, you could ignore the parameter(sig, lr, echo), I have initialized thiese parameters, but you could change if you want.
+2. Fuzzy C-Means:
+```python
+from usmerge import fcm_merge
+labels, edges = fcm_merge(data, n=3, m=2.0, max_iter=100, epsilon=1e-6)
+```
 
-Here is the whole method of this packages.
+3. Kernel Density Based:
+```python
+from usmerge import kernel_density_merge
+labels, edges = kernel_density_merge(data, n=3, bandwidth=None)
 ```
-from usmerge import equal_wid_merge,equal_fre_merge,kmeans_merge,som_k_merge
+
+### Return Values
+All clustering methods return two values:
+- `labels`: list of integer cluster labels (from 0 to n-1), one per data point
+- `edges`: list of cluster boundaries (n + 1 values for n clusters)
+
+## Example Analysis
+
+```python
+import numpy as np
+import matplotlib.pyplot as plt
+from usmerge import som_k_merge, fcm_merge, kmeans_merge, hierarchical_density_merge
+
+# Generate synthetic data with three clear clusters
+np.random.seed(42)
+data = np.concatenate([
+    np.random.normal(0, 0.3, 50),    # First cluster
+    np.random.normal(5, 0.4, 50),    # Second cluster
+    np.random.normal(10, 0.3, 50)    # Third cluster
+])
+
+# Compare different clustering methods
+methods = {
+    'SOM-K': som_k_merge(data, n=3, sigma=0.5, learning_rate=0.5, epochs=1000),
+    'FCM': fcm_merge(data, n=3, m=2.0, max_iter=100),
+    'K-means': kmeans_merge(data, n=3),
+    'Hierarchical Density': hierarchical_density_merge(data, n=3)
+}
+
+# Visualize results
+plt.figure(figsize=(15, 4))
+for i, (name, (labels, edges)) in enumerate(methods.items(), 1):
+    plt.subplot(1, 4, i)
+    plt.scatter(data, np.zeros_like(data), c=labels, cmap='viridis')
+    plt.title(f'{name} Clustering')
+    # Plot cluster boundaries
+    for edge in edges:
+        plt.axvline(x=edge, color='r', linestyle='--', alpha=0.5)
+    plt.ylim(-0.5, 0.5)
+
+plt.tight_layout()
+plt.show()
 ```
 
+## Parameters Guide
+
+Each clustering method has its own set of parameters:
+
+- SOM-K: `sigma` (neighborhood size), `learning_rate` (step size for weight updates), `epochs` (number of training iterations)
+- FCM: `m` (fuzziness), `max_iter` (maximum number of iterations), `epsilon` (convergence threshold)
+- Kernel Density: `bandwidth` (kernel width)
+- Information Theoretic: `alpha` (compression-accuracy trade-off)
+- Gaussian Mixture: `max_iter` (maximum number of iterations), `epsilon` (convergence threshold)
+- Hierarchical Density: `min_cluster_size` (minimum points per cluster)
 
+## Contributing
 
+Feel free to contribute to this project by submitting issues or pull requests.
 
+## License
 
+MIT License
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,12 @@
+[build-system]
+requires = ["setuptools>=45", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.black]
+line-length = 88
+target-version = ['py36', 'py37', 'py38', 'py39']
+
+[tool.pytest.ini_options]
+minversion = "6.0"
+addopts = "-ra -q"
+testpaths = ["tests"]
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -0,0 +1,4 @@
+pytest>=7.0.0
+black>=22.0.0
+flake8>=4.0.0
+mypy>=0.950
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,10 @@
+[flake8]
+max-line-length = 88
+extend-ignore = E203
+exclude = .git,__pycache__,build,dist,*.egg-info
+
+[mypy]
+python_version = 3.6
+warn_return_any = True
+warn_unused_configs = True
+disallow_untyped_defs = True
diff --git a/setup.py b/setup.py
@@ -6,13 +6,14 @@
 
 setup(
     name="usmerge",
-    version="0.1.1",
+    version="0.1.2",
     description="A simple package to merge one-dimension data by unsupervised method",
     author="HuiWang",
     author_email="huiw1128@gmail.com",
     packages=find_packages(),
     install_requires=[
-        "pandas","minisom","numpy","sklearn"
+        "pandas>=1.0.0",
+        "numpy>=1.18.0"
     ],
     classifiers=[
         "Development Status :: 3 - Alpha",

diff --git a/som-k.png b/som-k.png
diff --git a/tests/test_usmerge.py b/tests/test_usmerge.py
@@ -0,0 +1,67 @@
+import numpy as np
+import pytest
+from usmerge import (
+    equal_wid_merge,
+    equal_fre_merge,
+    kmeans_merge,
+    som_k_merge,
+    fcm_merge,
+    kernel_density_merge,
+    information_merge,
+    gaussian_mixture_merge,
+    hierarchical_density_merge
+)
+import pandas as pd
+
+@pytest.fixture
+def sample_data():
+    np.random.seed(42)
+    return np.concatenate([
+        np.random.normal(0, 0.3, 50),
+        np.random.normal(5, 0.4, 50),
+        np.random.normal(10, 0.3, 50)
+    ])
+
+def test_equal_width_binning(sample_data):
+    labels, edges = equal_wid_merge(sample_data, n=3)
+    assert len(labels) == len(sample_data)
+    assert len(edges) == 4
+    assert all(isinstance(x, int) for x in labels)
+    assert all(0 <= x < 3 for x in labels)
+
+def test_fcm_merge(sample_data):
+    labels, edges = fcm_merge(sample_data, n=3)
+    assert len(labels) == len(sample_data)
+    assert len(edges) == 4
+    assert all(isinstance(x, int) for x in labels)
+    assert all(0 <= x < 3 for x in labels)
+
+def test_kmeans_merge(sample_data):
+    labels, edges = kmeans_merge(sample_data, n=3)
+    assert len(labels) == len(sample_data)
+    assert len(edges) == 4
+    assert all(isinstance(x, int) for x in labels)
+    assert all(0 <= x < 3 for x in labels)
+
+def test_information_merge(sample_data):
+    labels, edges = information_merge(sample_data, n=3)
+    assert len(labels) == len(sample_data)
+    assert len(edges) == 4
+    assert all(isinstance(x, int) for x in labels)
+    assert all(0 <= x < 3 for x in labels)
+
+def test_input_formats():
+    data = [1.0, 2.0, 3.0, 4.0, 5.0]
+
+    # Test different input formats
+    pd_series = pd.Series(data)
+    pd_df = pd.DataFrame(data)
+    np_array = np.array(data)
+
+    labels1, _ = equal_wid_merge(pd_series, n=2)
+    labels2, _ = equal_wid_merge(pd_df, n=2)
+    labels3, _ = equal_wid_merge(np_array, n=2)
+
+    assert len(labels1) == len(data)
+    assert len(labels2) == len(data)
+    assert len(labels3) == len(data)
diff --git a/usmerge/__init__.py b/usmerge/__init__.py
@@ -1,5 +1,15 @@
-from .usmerge import equal_wid_merge,equal_fre_merge,kmeans_merge,som_k_merge
+from .usmerge import (
+    equal_wid_merge,
+    equal_fre_merge,
+    kmeans_merge,
+    som_k_merge,
+    fcm_merge,
+    kernel_density_merge,
+    information_merge,
+    gaussian_mixture_merge,
+    hierarchical_density_merge
+)
 
-__version__ = "0.1.1"
+__version__ = "0.1.2"
 __author__ = "HuiWang"
 __email__ = "huiw1128@gmail.com"
diff --git a/usmerge/__pycache__/__init__.cpython-38.pyc b/usmerge/__pycache__/__init__.cpython-38.pyc
diff --git a/usmerge/__pycache__/usmerge.cpython-38.pyc b/usmerge/__pycache__/usmerge.cpython-38.pyc