Skip to content

Commit

Permalink
sorting example
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Apr 13, 2017
1 parent 4333937 commit 3a3e02e
Showing 1 changed file with 139 additions and 119 deletions.
258 changes: 139 additions & 119 deletions pandas/tests/frame/test_sorting.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# -*- coding: utf-8 -*-

from __future__ import print_function

import random
import numpy as np

import pandas as pd
from pandas.compat import lrange
from pandas import (DataFrame, Series, MultiIndex, Timestamp,
date_range, NaT)
date_range, NaT, IntervalIndex)

from pandas.util.testing import (assert_series_equal,
assert_frame_equal,
Expand All @@ -19,45 +20,6 @@

class TestDataFrameSorting(tm.TestCase, TestData):

def test_sort_index(self):
# GH13496

frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
columns=['A', 'B', 'C', 'D'])

# axis=0 : sort rows by index labels
unordered = frame.loc[[3, 2, 4, 1]]
result = unordered.sort_index(axis=0)
expected = frame
assert_frame_equal(result, expected)

result = unordered.sort_index(ascending=False)
expected = frame[::-1]
assert_frame_equal(result, expected)

# axis=1 : sort columns by column names
unordered = frame.iloc[:, [2, 1, 3, 0]]
result = unordered.sort_index(axis=1)
assert_frame_equal(result, frame)

result = unordered.sort_index(axis=1, ascending=False)
expected = frame.iloc[:, ::-1]
assert_frame_equal(result, expected)

def test_sort_index_multiindex(self):
# GH13496

# sort rows by specified level of multi-index
mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC'))
df = DataFrame([[1, 2], [3, 4]], mi)

# MI sort, but no level: sort_level has no effect
mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
df = DataFrame([[1, 2], [3, 4]], mi)
result = df.sort_index(sort_remaining=False)
expected = df.sort_index()
assert_frame_equal(result, expected)

def test_sort(self):
frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
columns=['A', 'B', 'C', 'D'])
Expand Down Expand Up @@ -151,21 +113,6 @@ def test_sort_values_inplace(self):
expected = frame.sort_values(by=['A', 'B'], ascending=False)
assert_frame_equal(sorted_df, expected)

def test_sort_index_categorical_index(self):

df = (DataFrame({'A': np.arange(6, dtype='int64'),
'B': Series(list('aabbca'))
.astype('category', categories=list('cab'))})
.set_index('B'))

result = df.sort_index()
expected = df.iloc[[4, 0, 1, 5, 2, 3]]
assert_frame_equal(result, expected)

result = df.sort_index(ascending=False)
expected = df.iloc[[3, 2, 5, 1, 0, 4]]
assert_frame_equal(result, expected)

def test_sort_nan(self):
# GH3917
nan = np.nan
Expand Down Expand Up @@ -291,8 +238,86 @@ def test_stable_descending_multicolumn_sort(self):
kind='mergesort')
assert_frame_equal(sorted_df, expected)

def test_sort_datetimes(self):

# GH 3461, argsort / lexsort differences for a datetime column
df = DataFrame(['a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'],
columns=['A'],
index=date_range('20130101', periods=9))
dts = [Timestamp(x)
for x in ['2004-02-11', '2004-01-21', '2004-01-26',
'2005-09-20', '2010-10-04', '2009-05-12',
'2008-11-12', '2010-09-28', '2010-09-28']]
df['B'] = dts[::2] + dts[1::2]
df['C'] = 2.
df['A1'] = 3.

df1 = df.sort_values(by='A')
df2 = df.sort_values(by=['A'])
assert_frame_equal(df1, df2)

df1 = df.sort_values(by='B')
df2 = df.sort_values(by=['B'])
assert_frame_equal(df1, df2)

def test_frame_column_inplace_sort_exception(self):
s = self.frame['A']
with assertRaisesRegexp(ValueError, "This Series is a view"):
s.sort_values(inplace=True)

cp = s.copy()
cp.sort_values() # it works!

def test_sort_nat_values_in_int_column(self):

# GH 14922: "sorting with large float and multiple columns incorrect"

# cause was that the int64 value NaT was considered as "na". Which is
# only correct for datetime64 columns.

int_values = (2, int(NaT))
float_values = (2.0, -1.797693e308)

df = DataFrame(dict(int=int_values, float=float_values),
columns=["int", "float"])

df_reversed = DataFrame(dict(int=int_values[::-1],
float=float_values[::-1]),
columns=["int", "float"],
index=[1, 0])

# NaT is not a "na" for int64 columns, so na_position must not
# influence the result:
df_sorted = df.sort_values(["int", "float"], na_position="last")
assert_frame_equal(df_sorted, df_reversed)

df_sorted = df.sort_values(["int", "float"], na_position="first")
assert_frame_equal(df_sorted, df_reversed)

# reverse sorting order
df_sorted = df.sort_values(["int", "float"], ascending=False)
assert_frame_equal(df_sorted, df)

# and now check if NaT is still considered as "na" for datetime64
# columns:
df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT],
float=float_values), columns=["datetime", "float"])

df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")],
float=float_values[::-1]),
columns=["datetime", "float"],
index=[1, 0])

df_sorted = df.sort_values(["datetime", "float"], na_position="first")
assert_frame_equal(df_sorted, df_reversed)

df_sorted = df.sort_values(["datetime", "float"], na_position="last")
assert_frame_equal(df_sorted, df_reversed)


class TestDataFrameSortIndexKinds(tm.TestCase, TestData):

def test_sort_index_multicolumn(self):
import random
A = np.arange(5).repeat(20)
B = np.tile(np.arange(5), 20)
random.shuffle(A)
Expand Down Expand Up @@ -448,78 +473,73 @@ def test_sort_index_level(self):
res = df.sort_index(level=['A', 'B'], sort_remaining=False)
assert_frame_equal(df, res)

def test_sort_datetimes(self):

# GH 3461, argsort / lexsort differences for a datetime column
df = DataFrame(['a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'],
columns=['A'],
index=date_range('20130101', periods=9))
dts = [Timestamp(x)
for x in ['2004-02-11', '2004-01-21', '2004-01-26',
'2005-09-20', '2010-10-04', '2009-05-12',
'2008-11-12', '2010-09-28', '2010-09-28']]
df['B'] = dts[::2] + dts[1::2]
df['C'] = 2.
df['A1'] = 3.

df1 = df.sort_values(by='A')
df2 = df.sort_values(by=['A'])
assert_frame_equal(df1, df2)

df1 = df.sort_values(by='B')
df2 = df.sort_values(by=['B'])
assert_frame_equal(df1, df2)

def test_frame_column_inplace_sort_exception(self):
s = self.frame['A']
with assertRaisesRegexp(ValueError, "This Series is a view"):
s.sort_values(inplace=True)

cp = s.copy()
cp.sort_values() # it works!
def test_sort_index_categorical_index(self):

def test_sort_nat_values_in_int_column(self):
df = (DataFrame({'A': np.arange(6, dtype='int64'),
'B': Series(list('aabbca'))
.astype('category', categories=list('cab'))})
.set_index('B'))

# GH 14922: "sorting with large float and multiple columns incorrect"
result = df.sort_index()
expected = df.iloc[[4, 0, 1, 5, 2, 3]]
assert_frame_equal(result, expected)

# cause was that the int64 value NaT was considered as "na". Which is
# only correct for datetime64 columns.
result = df.sort_index(ascending=False)
expected = df.iloc[[3, 2, 5, 1, 0, 4]]
assert_frame_equal(result, expected)

int_values = (2, int(NaT))
float_values = (2.0, -1.797693e308)
def test_sort_index(self):
# GH13496

df = DataFrame(dict(int=int_values, float=float_values),
columns=["int", "float"])
frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
columns=['A', 'B', 'C', 'D'])

df_reversed = DataFrame(dict(int=int_values[::-1],
float=float_values[::-1]),
columns=["int", "float"],
index=[1, 0])
# axis=0 : sort rows by index labels
unordered = frame.loc[[3, 2, 4, 1]]
result = unordered.sort_index(axis=0)
expected = frame
assert_frame_equal(result, expected)

# NaT is not a "na" for int64 columns, so na_position must not
# influence the result:
df_sorted = df.sort_values(["int", "float"], na_position="last")
assert_frame_equal(df_sorted, df_reversed)
result = unordered.sort_index(ascending=False)
expected = frame[::-1]
assert_frame_equal(result, expected)

df_sorted = df.sort_values(["int", "float"], na_position="first")
assert_frame_equal(df_sorted, df_reversed)
# axis=1 : sort columns by column names
unordered = frame.iloc[:, [2, 1, 3, 0]]
result = unordered.sort_index(axis=1)
assert_frame_equal(result, frame)

# reverse sorting order
df_sorted = df.sort_values(["int", "float"], ascending=False)
assert_frame_equal(df_sorted, df)
result = unordered.sort_index(axis=1, ascending=False)
expected = frame.iloc[:, ::-1]
assert_frame_equal(result, expected)

# and now check if NaT is still considered as "na" for datetime64
# columns:
df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT],
float=float_values), columns=["datetime", "float"])
def test_sort_index_multiindex(self):
# GH13496

df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")],
float=float_values[::-1]),
columns=["datetime", "float"],
index=[1, 0])
# sort rows by specified level of multi-index
mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC'))
df = DataFrame([[1, 2], [3, 4]], mi)

df_sorted = df.sort_values(["datetime", "float"], na_position="first")
assert_frame_equal(df_sorted, df_reversed)
# MI sort, but no level: sort_level has no effect
mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
df = DataFrame([[1, 2], [3, 4]], mi)
result = df.sort_index(sort_remaining=False)
expected = df.sort_index()
assert_frame_equal(result, expected)

df_sorted = df.sort_values(["datetime", "float"], na_position="last")
assert_frame_equal(df_sorted, df_reversed)
def test_sort_index_intervalindex(self):
# this is a de-facto sort via unstack
# confirming that we sort in the order of the bins
y = Series(np.random.randn(100))
x1 = Series(np.sign(np.random.randn(100)))
x2 = pd.cut(Series(np.random.randn(100)),
bins=[-3, -0.5, 0, 0.5, 3])
model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2'])

result = model.groupby(['X1', 'X2']).mean().unstack()
expected = IntervalIndex.from_tuples(
[(-3.0, -0.5), (-0.5, 0.0),
(0.0, 0.5), (0.5, 3.0)],
closed='right')
result = result.columns.levels[1].categories
tm.assert_index_equal(result, expected)

0 comments on commit 3a3e02e

Please sign in to comment.