PyPandas is a data cleaning framework for Spark. It provides:
- Outlier detection
- Scaling and normalization
- Text cleaning
Install it from PyPI:
pip install pypandas
# Outlier detection
from pyspark.sql import SparkSession
from pypandas.outlier import OutlierRemover
# Create Spark Session
spark = SparkSession.builder.getOrCreate()
# Load your dataframe here; load_data() stands in for your own loader
df = load_data()
df.count()
# 1508177
df.show()
'''
+------------+-------------+
|Initial Cost|Total Est Fee|
+------------+-------------+
| 0.0| 130.0|
| 0.0| 320.0|
| 0.0| 0.0|
| 0.0| 100.0|
| 0.0| 1120.0|
| 0.0| 200.0|
| 0.0| 200.0|
| 0.0| 200.0|
| 0.0| 100.0|
| 0.0| 2230.28|
| 0.0| 1330.98|
| 0.0| 100.0|
| 125000.0| 992.21|
| 0.0| 2223.46|
| 0.0| 2223.46|
| 0.0| 260.0|
| 0.0| 408.0|
| 0.0| 0.0|
| 0.0| 538.0|
| 0.0| 0.0|
+------------+-------------+
only showing top 20 rows
'''
# Instantiate an outlier remover from the factory
# Available choices are "kmeans", "bisectingkmeans", and "gaussian"
km = OutlierRemover.factory("kmeans")
# Check (default) parameters
km.k
# 3
# Set parameters based on your knowledge of the dataset
km.set_param(k=5)
km.k
# 5
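# The other factory choices listed above are created the same way; a brief
# sketch (these two removers are not used in the rest of this walkthrough)
bkm = OutlierRemover.factory("bisectingkmeans")
gm = OutlierRemover.factory("gaussian")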
# Perform KMeans clustering
km.fit(df, ["Initial Cost", "Total Est Fee"])
# Get clustering summary
s = km.summary()
s.show()
'''
+-------------+-------+--------------------+-------------------------------+
|cluster index| size| cluster center|avg(distance to cluster center)|
+-------------+-------+--------------------+-------------------------------+
| 0|1506910| 86153.79855969973| 115623.10175225197|
| 1| 11|3.7746609272727275E8| 9.78881126369113E7|
| 2| 3| 9.55543933E8| 3.703218131733947E7|
| 3| 1206|1.6625062420529801E7| 7289962.118664694|
| 4| 47|1.1278780606382978E8| 3.6478952700282976E7|
+-------------+-------+--------------------+-------------------------------+
'''
# Show a particular cluster
cluster3 = km.get_cluster(3)
cluster3.show()
'''
+----------+--------------------------+------------+-------------+
|prediction|distance to cluster center|Initial Cost|Total Est Fee|
+----------+--------------------------+------------+-------------+
| 3| 4775293.760860619| 1.185E7| 122143.5|
| 3| 2216883.564916434| 1.4408275E7| 148501.2|
| 3| 8191841.731979836| 8433633.0| 86958.7|
| 3| 8186474.485411539| 8439000.0| 87010.2|
| 3| 2627175.097229941| 1.3998E7| 144815.9|
| 3| 2543986.5908025797| 1.4081188E7| 145258.1|
| 3| 1993081.6867797726| 1.4632E7| 160384.12|
| 3| 6555151.707938737| 1.0070235E7| 103944.8|
| 3| 3181787.7064663703| 1.344342E7| 138741.7|
| 3| 4030253.117367908| 1.2595E7| 129942.0|
| 3| 6375299.876071498| 2.3E7| 237113.5|
| 3| 2640317.4489083528| 1.9265215E7| 198653.3|
| 3| 3.3505087823614437E7| 5.0128347E7| 516757.2|
| 3| 1.987601568180299E7| 3.65E7| 376163.5|
| 3| 7292308.55701616| 9333200.0| 88484.5|
| 3| 1.4124048718706062E7| 3.0748338E7| 316928.2|
| 3| 2971534.1636515674| 1.3653663E7| 140849.7|
| 3| 4475199.270002893| 2.11E7| 217543.5|
| 3| 8049959.945984376| 8575506.0| 88546.3|
| 3| 509944.38674310147| 1.713495E7| 176759.0|
+----------+--------------------------+------------+-------------+
only showing top 20 rows
'''
# Filter outliers in cluster 3 using a distance threshold of 10,000,000
km.filter(3, 10000000)
# Get the filtered dataframe
newdf = km.get_dataframe()
# Show the new count; 165 rows were filtered out
newdf.count()
# 1508012
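# The filtered result is an ordinary Spark DataFrame, so it can be persisted
# or passed to the next cleaning step as usual; the path below is illustrative
newdf.write.mode("overwrite").parquet("permits_no_outliers.parquet")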
# Scaling and normalization
from pyspark.sql import SparkSession
from pypandas.scale import standard_scale, min_max_scale, max_abs_scale, normalize, unpack_vector
# Create Spark Session
spark = SparkSession.builder.getOrCreate()
# Create a dataframe
df = spark.createDataFrame([(1, 1.22, 2.34, 1.1, 3.5),
                            (2, -0.23, 5.23, 4.22, 2.34),
                            (3, 0.67, 1.34, -3.56, 7.45)],
                           ["id", "col1", "col2", "col3", "col4"])
df.show()
'''
+---+-----+----+-----+----+
| id| col1|col2| col3|col4|
+---+-----+----+-----+----+
| 1| 1.22|2.34| 1.1| 3.5|
| 2|-0.23|5.23| 4.22|2.34|
| 3| 0.67|1.34|-3.56|7.45|
+---+-----+----+-----+----+
'''
# Use the standard_scale() function
# Users can optionally set the withStd and withMean parameters:
# withStd: True by default. Scales the data to unit standard deviation.
# withMean: False by default. Centers the data with the mean before scaling.
#   This builds a dense output, so take care when applying it to sparse input.
scaled_df = standard_scale(df, ["col1", "col2", "col3", "col4"])
scaled_df.show(3,False)
'''
+---+-----+----+-----+----+-------------------------------------------------------------------------------+
|id |col1 |col2|col3 |col4|scaled features |
+---+-----+----+-----+----+-------------------------------------------------------------------------------+
|1 |1.22 |2.34|1.1 |3.5 |[1.6666521513105603,1.1583732592894616,0.28094763789611055,1.3064886711618937] |
|2 |-0.23|5.23|4.22 |2.34|[-0.31420491377166304,2.5890137376426856,1.0778173017468968,0.8734809972910946]|
|3 |0.67 |1.34|-3.56|7.45|[0.9152925749000619,0.6633419519008028,-0.9092487190092305,2.78095445718746] |
+---+-----+----+-----+----+-------------------------------------------------------------------------------+
'''
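# A minimal sketch of the options described above, assuming withStd and
# withMean are forwarded as keyword arguments; withMean=True also centers
# each column before scaling
centered_df = standard_scale(df, ["col1", "col2", "col3", "col4"],
                             withMean=True, withStd=True)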
# Use the min_max_scale() function
# Users can optionally set the min and max parameters:
# min: 0.0 by default. Lower bound after transformation, shared by all features.
# max: 1.0 by default. Upper bound after transformation, shared by all features.
scaled_df = min_max_scale(df, ["col1", "col2", "col3", "col4"])
scaled_df.show(3,False)
'''
+---+-----+----+-----+----+---------------------------------------------------------------+
|id |col1 |col2|col3 |col4|scaled features |
+---+-----+----+-----+----+---------------------------------------------------------------+
|1 |1.22 |2.34|1.1 |3.5 |[1.0,0.25706940874035983,0.5989717223650386,0.2270058708414873]|
|2 |-0.23|5.23|4.22 |2.34|[0.0,1.0,1.0,0.0] |
|3 |0.67 |1.34|-3.56|7.45|[0.6206896551724138,0.0,0.0,1.0] |
+---+-----+----+-----+----+---------------------------------------------------------------+
'''
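# A sketch of a custom range, assuming min and max are accepted as keyword
# arguments as described above; this rescales every feature into [-1, 1]
scaled_df = min_max_scale(df, ["col1", "col2", "col3", "col4"],
                          min=-1.0, max=1.0)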
# Use the max_abs_scale() function
scaled_df = max_abs_scale(df, ["col1", "col2", "col3", "col4"])
scaled_df.show(3,False)
'''
+---+-----+----+-----+----+----------------------------------------------------------------+
|id |col1 |col2|col3 |col4|scaled features |
+---+-----+----+-----+----+----------------------------------------------------------------+
|1 |1.22 |2.34|1.1 |3.5 |[1.0,0.4474187380497131,0.26066350710900477,0.4697986577181208] |
|2 |-0.23|5.23|4.22 |2.34|[-0.18852459016393444,1.0,1.0,0.3140939597315436] |
|3 |0.67 |1.34|-3.56|7.45|[0.5491803278688525,0.25621414913957935,-0.8436018957345972,1.0]|
+---+-----+----+-----+----+----------------------------------------------------------------+
'''
# Use the normalize() function
# Users can optionally set the p-norm parameter
# p-norm: 2 by default; determines the norm used for normalization
normalized_df = normalize(df, ["col1", "col2", "col3", "col4"])
normalized_df.show(3,False)
'''
+---+-----+----+-----+----+--------------------------------------------------------------------------------+
|id |col1 |col2|col3 |col4|normalized features |
+---+-----+----+-----+----+--------------------------------------------------------------------------------+
|1 |1.22 |2.34|1.1 |3.5 |[0.26995379041977813,0.5177802209690826,0.2434009585752098,0.7744575954665766] |
|2 |-0.23|5.23|4.22 |2.34|[-0.032304836487058015,0.7345838905535367,0.592723521632108,0.328666597303112] |
|3 |0.67 |1.34|-3.56|7.45|[0.07984081144863289,0.15968162289726578,-0.4242287891897508,0.8877821571527089]|
+---+-----+----+-----+----+--------------------------------------------------------------------------------+
'''
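# A sketch of L1 normalization, assuming the p-norm is exposed as the
# keyword argument p
l1_df = normalize(df, ["col1", "col2", "col3", "col4"], p=1.0)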
# Use the unpack_vector() function
# unpack_vector() unpacks a vector column into separate named columns
unpack_df = unpack_vector(normalized_df, "normalized features", ["norm col1", "norm col2", "norm col3", "norm col4"])
unpack_df.show(3,False)
'''
+---+-----+----+-----+----+--------------------------------------------------------------------------------+
|id |col1 |col2|col3 |col4|normalized features |
+---+-----+----+-----+----+--------------------------------------------------------------------------------+
|1 |1.22 |2.34|1.1 |3.5 |[0.26995379041977813,0.5177802209690826,0.2434009585752098,0.7744575954665766] |
|2 |-0.23|5.23|4.22 |2.34|[-0.032304836487058015,0.7345838905535367,0.592723521632108,0.328666597303112] |
|3 |0.67 |1.34|-3.56|7.45|[0.07984081144863289,0.15968162289726578,-0.4242287891897508,0.8877821571527089]|
+---+-----+----+-----+----+--------------------------------------------------------------------------------+
|
|
v
+---+-----+----+-----+----+-----------+----------+----------+----------+
|id |col1 |col2|col3 |col4|norm col1 |norm col2 |norm col3 |norm col4 |
+---+-----+----+-----+----+-----------+----------+----------+----------+
|1 |1.22 |2.34|1.1 |3.5 |0.2699538 |0.51778024|0.24340096|0.7744576 |
|2 |-0.23|5.23|4.22 |2.34|-0.03230484|0.7345839 |0.59272355|0.3286666 |
|3 |0.67 |1.34|-3.56|7.45|0.07984081 |0.15968162|-0.4242288|0.88778216|
+---+-----+----+-----+----+-----------+----------+----------+----------+
'''
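# The unpacked columns are ordinary DataFrame columns; for instance, the
# vector column can be dropped once it is no longer needed
flat_df = unpack_df.drop("normalized features")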
# Text cleaning
from pyspark.sql import SparkSession
from pypandas.text_cleaner import clean_text, sub_with_pattern, TextCleaner
# Create Spark Session
spark = SparkSession.builder.getOrCreate()
# Set up data
data = [(
' You can find the cheapest good on https://www.amazon.com/. ',
'See, this iPhoneX only cost $20 dollar!!! ',
' What the ______ :)) '
)]
df = spark.createDataFrame(data, schema=['1', '2', '3'])
df.show()
'''
+--------------------+--------------------+--------------------+
| 1| 2| 3|
+--------------------+--------------------+--------------------+
| You can find ...|See, this iPhoneX...| What the ______ ...|
+--------------------+--------------------+--------------------+
'''
# General cleaning feature
# Clean all columns by passing '*'
clean_text(df, '*').collect()
# [Row(
# 1='You can find the cheapest good on _url_',
# 2='See this iPhoneX only cost _number_ dollar',
# 3='What the _ '
# )]
# Clean specific columns
clean_text(df, ['1', '2']).collect()
# [Row(
# 1='You can find the cheapest good on _url_',
# 2='See this iPhoneX only cost _number_ dollar',
# 3=' What the ______ :)) '
# )]
# Customize your own cleaning rules
# For example, removing the dollar sign (note the raw string for the regex)
sub_with_pattern(df, ['2'], r'\$', '').select('2').collect()
# [Row(2='See, this iPhoneX only cost 20 dollar!!! ')]
# You can apply your own regular expressions within the framework
# For example, collapsing consecutive white spaces into a single space
sub_with_pattern(df, ['1'], ' +', ' ').select('1').collect()
# [Row(1=' You can find the cheapest good on https://www.amazon.com/. ')]
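# The same helper takes any regular expression; for example, collapsing the
# repeated exclamation marks in column '2' (the '!!!' becomes a single '!')
sub_with_pattern(df, ['2'], r'!+', '!').select('2').collect()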
# To customize your cleaning process, you can use TextCleaner
cleaner = TextCleaner()
# First, remove leading spaces
cleaner.register_re_sub('^ +', '')
# Then, remove trailing spaces
cleaner.register_re_sub(' +$', '')
# Clean dataframe
cleaner.clean(df, ['1']).select('1').collect()
# [Row(1='You can find the cheapest good on https://www.amazon.com/.')]
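# Multiple substitutions can be registered on one cleaner; a sketch that adds
# a URL rule to the trims above (the URL pattern here is an assumption, not
# part of the library)
cleaner.register_re_sub(r'https?://\S+', '_url_')
cleaner.clean(df, ['1']).select('1').collect()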