import pandas as pd
5 Series Deep Dive
Source: https://github.com/mattharrison/effective_pandas_book
url = 'https://github.com/mattharrison/datasets/raw/master/data/' \
      'vehicles.csv.zip'
df = pd.read_csv(url)
city_mpg = df.city08
highway_mpg = df.highway08
/var/folders/70/7wmmf6t55cb84bfx9g1c1k1m0000gn/T/ipykernel_6435/3976941641.py:1: DtypeWarning: Columns (68,70,71,72,73,74,76,79) have mixed types. Specify dtype option on import or set low_memory=False.
df = pd.read_csv(url)
city_mpg
0 19
1 9
2 23
3 10
4 17
..
41139 19
41140 20
41141 18
41142 18
41143 16
Name: city08, Length: 41144, dtype: int64
highway_mpg
0 25
1 14
2 33
3 12
4 23
..
41139 26
41140 28
41141 24
41142 24
41143 21
Name: highway08, Length: 41144, dtype: int64
len(dir(city_mpg))
419
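Most of those 400+ attributes are dunder methods and other internals. A quick way to see just the public interface (my own sketch, not from the book):
# count only the public (non-underscore) attributes
len([a for a in dir(city_mpg) if not a.startswith('_')])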
5.1 Dunder Methods
2 + 4
6
(2).__add__(4)
6
(city_mpg + highway_mpg) / 2
0 22.0
1 11.5
2 28.0
3 11.0
4 20.0
...
41139 22.5
41140 24.0
41141 21.0
41142 21.0
41143 18.5
Length: 41144, dtype: float64
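The operator and the dunder method are two spellings of the same call, so the results should match exactly. A quick sanity check (my addition, not from the original):
# + dispatches to .__add__; .equals confirms the results are identical
(city_mpg + highway_mpg).equals(city_mpg.__add__(highway_mpg))  # True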
5.2 Index Alignment
When you operate with two series, pandas will align the index before performing the operation. Aligning will take each index entry in the left series and match it up with every entry with the same name in the index of the right series.
Because of index alignment, you will want to make sure that the indexes:
- Are unique (no duplicates)
- Are common to both series (a quick check for both is sketched below)
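A minimal sketch of such a check (check_aligned is my own hypothetical helper, not something from the book):
# hypothetical helper: fail fast if indexes are duplicated or mismatched
def check_aligned(left, right):
    assert left.index.is_unique and right.index.is_unique, 'duplicate index entries'
    assert left.index.equals(right.index), 'indexes do not match'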
s1 = pd.Series([10, 20, 30], index=[1, 2, 2])
s2 = pd.Series([35, 44, 53], index=[2, 2, 4], name='s2')
s1
1 10
2 20
2 30
dtype: int64
s2
2 35
2 44
4 53
Name: s2, dtype: int64
s1 + s2
1 NaN
2 55.0
2 64.0
2 65.0
2 74.0
4 NaN
dtype: float64
s1.add(s2, fill_value=0)
1 10.0
2 55.0
2 64.0
2 65.0
2 74.0
4 53.0
dtype: float64
((city_mpg
  + highway_mpg)
 / 2
)
0 22.0
1 11.5
2 28.0
3 11.0
4 20.0
...
41139 22.5
41140 24.0
41141 21.0
41142 21.0
41143 18.5
Length: 41144, dtype: float64
(city_mpg
 .add(highway_mpg)
 .div(2)
)
0 22.0
1 11.5
2 28.0
3 11.0
4 20.0
...
41139 22.5
41140 24.0
41141 21.0
41142 21.0
41143 18.5
Length: 41144, dtype: float64
city_mpg.mean()
18.369045304297103
city_mpg.is_unique
False
city_mpg.is_monotonic_increasing
False
city_mpg.quantile()
17.0
city_mpg.quantile(.9)
24.0
city_mpg.quantile([.1, .5, .9])
0.1 13.0
0.5 17.0
0.9 24.0
Name: city08, dtype: float64
(city_mpg
 .gt(20)
 .sum()
)
10272
(city_mpg
 .gt(20)
 .mul(100)
 .mean()
)
24.965973167412017
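Because True is treated as 1 and False as 0, the mean of a boolean series is already the fraction of True values; the .mul(100) just scales it to a percentage. An equivalent, shorter spelling (my addition):
# mean of a boolean series is the fraction of True values
city_mpg.gt(20).mean() * 100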
city_mpg.agg('mean')
18.369045304297103
import numpy as np
def second_to_last(s):
    return s.iloc[-2]
city_mpg.agg(['mean', np.var, max, second_to_last])
mean 18.369045
var 62.503036
max 150.000000
second_to_last 18.000000
Name: city08, dtype: float64
city_mpg.convert_dtypes()
0 19
1 9
2 23
3 10
4 17
..
41139 19
41140 20
41141 18
41142 18
41143 16
Name: city08, Length: 41144, dtype: Int64
city_mpg.astype('Int16')
0 19
1 9
2 23
3 10
4 17
..
41139 19
41140 20
41141 18
41142 18
41143 16
Name: city08, Length: 41144, dtype: Int16
city_mpg.astype('Int8')
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
...
TypeError: Cannot cast array data from dtype('int64') to dtype('int8')
according to the rule 'safe'

The above exception was the direct cause of the following exception:
...
TypeError: cannot safely cast non-equivalent int64 to int8
np.iinfo('int64')
iinfo(min=-9223372036854775808, max=9223372036854775807, dtype=int64)
np.iinfo('uint8')
iinfo(min=0, max=255, dtype=uint8)
np.finfo('float16')
finfo(resolution=0.001, min=-6.55040e+04, max=6.55040e+04, dtype=float16)
np.finfo('float64')
finfo(resolution=1e-15, min=-1.7976931348623157e+308, max=1.7976931348623157e+308, dtype=float64)
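One practical use of these limits (a sketch of my own, not from the book) is to confirm that a smaller type can hold the data before converting:
# confirm int16 can represent the full range of city_mpg before downcasting
info = np.iinfo('int16')
info.min <= city_mpg.min() and city_mpg.max() <= info.max  # True here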
city_mpg.nbytes
329152
city_mpg.astype('Int16').nbytes
123432
make = df.make
make.nbytes
329152
make.memory_usage()
329280
make.memory_usage(deep=True)
2606395
(make
 .astype('category')
 .memory_usage(deep=True)
)
95888
city_mpg.astype(str)
0 19
1 9
2 23
3 10
4 17
..
41139 19
41140 20
41141 18
41142 18
41143 16
Name: city08, Length: 41144, dtype: object
city_mpg.astype('category')
0 19
1 9
2 23
3 10
4 17
..
41139 19
41140 20
41141 18
41142 18
41143 16
Name: city08, Length: 41144, dtype: category
Categories (105, int64): [6, 7, 8, 9, ..., 137, 138, 140, 150]
values = pd.Series(sorted(set(city_mpg)))
city_type = pd.CategoricalDtype(categories=values,
                                ordered=True)
city_mpg.astype(city_type)
0 19
1 9
2 23
3 10
4 17
..
41139 19
41140 20
41141 18
41142 18
41143 16
Name: city08, Length: 41144, dtype: category
Categories (105, int64): [6 < 7 < 8 < 9 ... 137 < 138 < 140 < 150]
city_mpg.to_frame()
 | city08 |
---|---|
0 | 19 |
1 | 9 |
2 | 23 |
3 | 10 |
4 | 17 |
... | ... |
41139 | 19 |
41140 | 20 |
41141 | 18 |
41142 | 18 |
41143 | 16 |
41144 rows × 1 columns
def gt20(val):
    return val > 20
%%timeit
city_mpg.apply(gt20)
4.48 ms ± 49.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
city_mpg.gt(20)
76.7 µs ± 931 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
make = df.make
make
0 Alfa Romeo
1 Ferrari
2 Dodge
3 Dodge
4 Subaru
...
41139 Subaru
41140 Subaru
41141 Subaru
41142 Subaru
41143 Subaru
Name: make, Length: 41144, dtype: object
make.value_counts()
Chevrolet 4003
Ford 3371
Dodge 2583
GMC 2494
Toyota 2071
...
Volga Associated Automobile 1
Panos 1
Mahindra 1
Excalibur Autos 1
London Coach Co Inc 1
Name: make, Length: 136, dtype: int64
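value_counts can also report relative frequencies; normalize=True is a standard parameter, though the cell above uses the default counts (this example is my addition):
# relative frequency of each make instead of raw counts
make.value_counts(normalize=True)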
top5 = make.value_counts().index[:5]
def generalize_top5(val):
    if val in top5:
        return val
    return 'Other'
make.apply(generalize_top5)
0 Other
1 Other
2 Dodge
3 Dodge
4 Other
...
41139 Other
41140 Other
41141 Other
41142 Other
41143 Other
Name: make, Length: 41144, dtype: object
make.where(make.isin(top5), other='Other')
0 Other
1 Other
2 Dodge
3 Dodge
4 Other
...
41139 Other
41140 Other
41141 Other
41142 Other
41143 Other
Name: make, Length: 41144, dtype: object
%%timeit
make.apply(generalize_top5)
11 ms ± 76.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
make.where(make.isin(top5), 'Other')
1.65 ms ± 35.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
make.mask(~make.isin(top5), other='Other')
0 Other
1 Other
2 Dodge
3 Dodge
4 Other
...
41139 Other
41140 Other
41141 Other
41142 Other
41143 Other
Name: make, Length: 41144, dtype: object
vc = make.value_counts()
top5 = vc.index[:5]
top10 = vc.index[:10]
def generalize(val):
    if val in top5:
        return val
    elif val in top10:
        return 'Top10'
    else:
        return 'Other'
make.apply(generalize)
0 Other
1 Other
2 Dodge
3 Dodge
4 Other
...
41139 Other
41140 Other
41141 Other
41142 Other
41143 Other
Name: make, Length: 41144, dtype: object
(make
 .where(make.isin(top5), 'Top10')
 .where(make.isin(top10), 'Other')
)
0 Other
1 Other
2 Dodge
3 Dodge
4 Other
...
41139 Other
41140 Other
41141 Other
41142 Other
41143 Other
Name: make, Length: 41144, dtype: object
import numpy as np
np.select([make.isin(top5), make.isin(top10)],
          [make, 'Top10'], 'Other')
array(['Other', 'Other', 'Dodge', ..., 'Other', 'Other', 'Other'],
dtype=object)
pd.Series(np.select([make.isin(top5), make.isin(top10)],
                    [make, 'Top10'], 'Other'), index=make.index)
0 Other
1 Other
2 Dodge
3 Dodge
4 Other
...
41139 Other
41140 Other
41141 Other
41142 Other
41143 Other
Length: 41144, dtype: object
cyl = df.cylinders

(cyl
 .isna()
 .sum()
)
206
missing = cyl.isna()
make.loc[missing]
7138 Nissan
7139 Toyota
8143 Toyota
8144 Ford
8146 Ford
...
34563 Tesla
34564 Tesla
34565 Tesla
34566 Tesla
34567 Tesla
Name: make, Length: 206, dtype: object
cyl[cyl.isna()]
7138 NaN
7139 NaN
8143 NaN
8144 NaN
8146 NaN
..
34563 NaN
34564 NaN
34565 NaN
34566 NaN
34567 NaN
Name: cylinders, Length: 206, dtype: float64
cyl.fillna(0).loc[7136:7141]
7136 6.0
7137 6.0
7138 0.0
7139 0.0
7140 6.0
7141 6.0
Name: cylinders, dtype: float64
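Zero is not the only reasonable fill value; a summary statistic is another common choice. A sketch (my addition, not from the original):
# fill missing cylinder counts with the column median instead of 0
cyl.fillna(cyl.median()).loc[7136:7141]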
temp = pd.Series([32, 40, None, 42, 39, 32])
temp
0 32.0
1 40.0
2 NaN
3 42.0
4 39.0
5 32.0
dtype: float64
temp.interpolate()
0 32.0
1 40.0
2 41.0
3 42.0
4 39.0
5 32.0
dtype: float64
city_mpg.loc[:446]
0 19
1 9
2 23
3 10
4 17
..
442 15
443 15
444 15
445 15
446 31
Name: city08, Length: 447, dtype: int64
(city_mpg
 .loc[:446]
 .clip(lower=city_mpg.quantile(.05),
       upper=city_mpg.quantile(.95))
)
0 19.0
1 11.0
2 23.0
3 11.0
4 17.0
...
442 15.0
443 15.0
444 15.0
445 15.0
446 27.0
Name: city08, Length: 447, dtype: float64
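.clip is equivalent to a pair of .where calls that pin values outside the bounds. A sketch of the equivalence (mine, not the book's):
# clip expressed as two .where calls
lo, hi = city_mpg.quantile(.05), city_mpg.quantile(.95)
(city_mpg
 .loc[:446]
 .where(lambda s: s.ge(lo), lo)   # raise anything below the 5th percentile
 .where(lambda s: s.le(hi), hi)   # lower anything above the 95th
)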
city_mpg.sort_values()
7901 6
34557 6
37161 6
21060 6
35887 6
...
34563 138
34564 140
32599 150
31256 150
33423 150
Name: city08, Length: 41144, dtype: int64
(city_mpg.sort_values() + highway_mpg) / 2
0 22.0
1 11.5
2 28.0
3 11.0
4 20.0
...
41139 22.5
41140 24.0
41141 21.0
41142 21.0
41143 18.5
Length: 41144, dtype: float64
city_mpg.sort_values().sort_index()
0 19
1 9
2 23
3 10
4 17
..
41139 19
41140 20
41141 18
41142 18
41143 16
Name: city08, Length: 41144, dtype: int64
city_mpg.drop_duplicates()
0 19
1 9
2 23
3 10
4 17
...
34364 127
34409 114
34564 140
34565 115
34566 104
Name: city08, Length: 105, dtype: int64
city_mpg.rank()
0 27060.5
1 235.5
2 35830.0
3 607.5
4 19484.0
...
41139 27060.5
41140 29719.5
41141 23528.0
41142 23528.0
41143 15479.0
Name: city08, Length: 41144, dtype: float64
city_mpg.rank(method='min')
0 25555.0
1 136.0
2 35119.0
3 336.0
4 17467.0
...
41139 25555.0
41140 28567.0
41141 21502.0
41142 21502.0
41143 13492.0
Name: city08, Length: 41144, dtype: float64
city_mpg.rank(method='dense')
0 14.0
1 4.0
2 18.0
3 5.0
4 12.0
...
41139 14.0
41140 15.0
41141 13.0
41142 13.0
41143 11.0
Name: city08, Length: 41144, dtype: float64
make.replace('Subaru', 'スバル')
0 Alfa Romeo
1 Ferrari
2 Dodge
3 Dodge
4 スバル
...
41139 スバル
41140 スバル
41141 スバル
41142 スバル
41143 スバル
Name: make, Length: 41144, dtype: object
make.replace(r'(Fer)ra(r.*)',
             value=r'\2-other-\1', regex=True)
0 Alfa Romeo
1 ri-other-Fer
2 Dodge
3 Dodge
4 Subaru
...
41139 Subaru
41140 Subaru
41141 Subaru
41142 Subaru
41143 Subaru
Name: make, Length: 41144, dtype: object
pd.cut(city_mpg, 10)
0 (5.856, 20.4]
1 (5.856, 20.4]
2 (20.4, 34.8]
3 (5.856, 20.4]
4 (5.856, 20.4]
...
41139 (5.856, 20.4]
41140 (5.856, 20.4]
41141 (5.856, 20.4]
41142 (5.856, 20.4]
41143 (5.856, 20.4]
Name: city08, Length: 41144, dtype: category
Categories (10, interval[float64, right]): [(5.856, 20.4] < (20.4, 34.8] < (34.8, 49.2] < (49.2, 63.6] ... (92.4, 106.8] < (106.8, 121.2] < (121.2, 135.6] < (135.6, 150.0]]
pd.cut(city_mpg, [0, 10, 20, 40, 70, 150])
0 (10, 20]
1 (0, 10]
2 (20, 40]
3 (0, 10]
4 (10, 20]
...
41139 (10, 20]
41140 (10, 20]
41141 (10, 20]
41142 (10, 20]
41143 (10, 20]
Name: city08, Length: 41144, dtype: category
Categories (5, interval[int64, right]): [(0, 10] < (10, 20] < (20, 40] < (40, 70] < (70, 150]]
pd.qcut(city_mpg, 10)
0 (18.0, 20.0]
1 (5.999, 13.0]
2 (21.0, 24.0]
3 (5.999, 13.0]
4 (16.0, 17.0]
...
41139 (18.0, 20.0]
41140 (18.0, 20.0]
41141 (17.0, 18.0]
41142 (17.0, 18.0]
41143 (15.0, 16.0]
Name: city08, Length: 41144, dtype: category
Categories (10, interval[float64, right]): [(5.999, 13.0] < (13.0, 14.0] < (14.0, 15.0] < (15.0, 16.0] ... (18.0, 20.0] < (20.0, 21.0] < (21.0, 24.0] < (24.0, 150.0]]
pd.qcut(city_mpg, 10, labels=list(range(1, 11)))
0 7
1 1
2 9
3 1
4 5
..
41139 7
41140 7
41141 6
41142 6
41143 4
Name: city08, Length: 41144, dtype: category
Categories (10, int64): [1 < 2 < 3 < 4 ... 7 < 8 < 9 < 10]
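The practical difference between cut and qcut shows up in the bin counts: cut makes equal-width bins, qcut makes (roughly) equal-sized ones. A quick check (my addition):
# equal-width bins have very uneven membership; quantile bins are balanced
pd.cut(city_mpg, 10).value_counts()
pd.qcut(city_mpg, 10).value_counts()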
city2 = city_mpg.rename(make.to_dict())
city2
Alfa Romeo 19
Ferrari 9
Dodge 23
Dodge 10
Subaru 17
..
Subaru 19
Subaru 20
Subaru 18
Subaru 18
Subaru 16
Name: city08, Length: 41144, dtype: int64
city2.index
Index(['Alfa Romeo', 'Ferrari', 'Dodge', 'Dodge', 'Subaru', 'Subaru', 'Subaru',
'Toyota', 'Toyota', 'Toyota',
...
'Saab', 'Saturn', 'Saturn', 'Saturn', 'Saturn', 'Subaru', 'Subaru',
'Subaru', 'Subaru', 'Subaru'],
dtype='object', length=41144)
city2 = city_mpg.rename(make)
city2
Alfa Romeo 19
Ferrari 9
Dodge 23
Dodge 10
Subaru 17
..
Subaru 19
Subaru 20
Subaru 18
Subaru 18
Subaru 16
Name: city08, Length: 41144, dtype: int64
city2.rename('citympg')
Alfa Romeo 19
Ferrari 9
Dodge 23
Dodge 10
Subaru 17
..
Subaru 19
Subaru 20
Subaru 18
Subaru 18
Subaru 16
Name: citympg, Length: 41144, dtype: int64
city2.reset_index()
 | index | city08 |
---|---|---|
0 | Alfa Romeo | 19 |
1 | Ferrari | 9 |
2 | Dodge | 23 |
3 | Dodge | 10 |
4 | Subaru | 17 |
... | ... | ... |
41139 | Subaru | 19 |
41140 | Subaru | 20 |
41141 | Subaru | 18 |
41142 | Subaru | 18 |
41143 | Subaru | 16 |
41144 rows × 2 columns
city2.reset_index(drop=True)
0 19
1 9
2 23
3 10
4 17
..
41139 19
41140 20
41141 18
41142 18
41143 16
Name: city08, Length: 41144, dtype: int64
city2.loc['Subaru']
Subaru 17
Subaru 21
Subaru 22
Subaru 19
Subaru 20
..
Subaru 19
Subaru 20
Subaru 18
Subaru 18
Subaru 16
Name: city08, Length: 885, dtype: int64
city2.loc['Fisker']
20
city2.loc[['Fisker']]
Fisker 20
Name: city08, dtype: int64
city2.loc[['Ferrari', 'Lamborghini']]
Ferrari 9
Ferrari 12
Ferrari 11
Ferrari 10
Ferrari 11
..
Lamborghini 6
Lamborghini 8
Lamborghini 8
Lamborghini 8
Lamborghini 8
Name: city08, Length: 357, dtype: int64
city2.loc['Ferrari':'Lamborghini']
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
...
KeyError: "Cannot get left slice bound for non-unique label: 'Ferrari'"
city2.sort_index().loc['Ferrari':'Lamborghini']
Ferrari 10
Ferrari 13
Ferrari 13
Ferrari 9
Ferrari 10
..
Lamborghini 12
Lamborghini 9
Lamborghini 8
Lamborghini 13
Lamborghini 8
Name: city08, Length: 11210, dtype: int64
"F":"J"] city2.sort_index().loc[
Federal Coach 15
Federal Coach 13
Federal Coach 13
Federal Coach 14
Federal Coach 13
..
Isuzu 15
Isuzu 15
Isuzu 15
Isuzu 27
Isuzu 18
Name: city08, Length: 9040, dtype: int64
idx = pd.Index(['Dodge'])
city2.loc[idx]
Dodge 23
Dodge 10
Dodge 12
Dodge 11
Dodge 11
..
Dodge 18
Dodge 17
Dodge 14
Dodge 14
Dodge 11
Name: city08, Length: 2583, dtype: int64
idx = pd.Index(['Dodge', 'Dodge'])
city2.loc[idx]
Dodge 23
Dodge 10
Dodge 12
Dodge 11
Dodge 11
..
Dodge 18
Dodge 17
Dodge 14
Dodge 14
Dodge 11
Name: city08, Length: 5166, dtype: int64
mask = city2 > 50
mask
Alfa Romeo False
Ferrari False
Dodge False
Dodge False
Subaru False
...
Subaru False
Subaru False
Subaru False
Subaru False
Subaru False
Name: city08, Length: 41144, dtype: bool
city2.loc[mask]
Nissan 81
Toyota 81
Toyota 81
Ford 74
Nissan 84
...
Tesla 140
Tesla 115
Tesla 104
Tesla 98
Toyota 55
Name: city08, Length: 236, dtype: int64
cost = pd.Series([1.00, 2.25, 3.99, .99, 2.79],
                 index=['Gum', 'Cookie', 'Melon', 'Roll', 'Carrots'])
inflation = 1.10
(cost
 .mul(inflation)
 .loc[lambda s_: s_ > 3]
)
Melon 4.389
Carrots 3.069
dtype: float64
cost = pd.Series([1.00, 2.25, 3.99, .99, 2.79],
                 index=['Gum', 'Cookie', 'Melon', 'Roll', 'Carrots'])
inflation = 1.10
mask = cost > 3    # computed on pre-inflation prices, unlike the lambda above
(cost
 .mul(inflation)
 .loc[mask]
)
Melon 4.389
dtype: float64
def gt3(s):
    return s > 3
gt3 = lambda s: s > 3
city2.iloc[0]
19
city2.iloc[-1]
16
city2.iloc[[0, 1, -1]]
Alfa Romeo 19
Ferrari 9
Subaru 16
Name: city08, dtype: int64
city2.iloc[0:5]
Alfa Romeo 19
Ferrari 9
Dodge 23
Dodge 10
Subaru 17
Name: city08, dtype: int64
city2.iloc[-8:]
Saturn 21
Saturn 24
Saturn 21
Subaru 19
Subaru 20
Subaru 18
Subaru 18
Subaru 16
Name: city08, dtype: int64
mask = city2 > 50
city2.iloc[mask]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
...
ValueError: iLocation based boolean indexing cannot use an indexable as a mask
city2.iloc[mask.to_numpy()]
Nissan 81
Toyota 81
Toyota 81
Ford 74
Nissan 84
...
Tesla 140
Tesla 115
Tesla 104
Tesla 98
Toyota 55
Name: city08, Length: 236, dtype: int64
city2.iloc[list(mask)]
Nissan 81
Toyota 81
Toyota 81
Ford 74
Nissan 84
...
Tesla 140
Tesla 115
Tesla 104
Tesla 98
Toyota 55
Name: city08, Length: 236, dtype: int64
city2.head(3)
Alfa Romeo 19
Ferrari 9
Dodge 23
Name: city08, dtype: int64
city2.tail(3)
Subaru 18
Subaru 18
Subaru 16
Name: city08, dtype: int64
city2.sample(6, random_state=42)
Volvo 16
Mitsubishi 19
Buick 27
Jeep 15
Land Rover 13
Saab 17
Name: city08, dtype: int64
city2.filter(items=['Ford', 'Subaru'])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
...
ValueError: cannot reindex from a duplicate axis
city2.filter(like='rd')
city2.filter(regex='(Ford)|(Subaru)')
city2.reindex(['Missing', 'Ford'])
city_mpg.reindex([0, 0, 10, 20, 2_000_000])
s1 = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
s2 = pd.Series([15, 25, 35], index=['b', 'c', 'd'])
s2
s2.reindex(s1.index)
- Inspect the index.
- Sort the index.
- Set the index to monotonically increasing integers starting from 0.
- Set the index to monotonically increasing integers starting from 0, then convert these to the string version. Save this as s2.
- Using s2, pull out the first 5 entries.
- Using s2, pull out the last 5 entries.
- Using s2, pull out one hundred entries starting at index position 10.
- Using s2, create a series with values with index entries '20', '10', and '2'.
make
make.astype('string')
make.astype('category')
'Ford'.lower()
make.str.lower()
'Alfa Romeo'.find('A')
make.str.find('A')
make.str.extract(r'([^a-z A-Z])')
(make
 .str.extract(r'([^a-z A-Z])', expand=False)
 .value_counts()
)
age = pd.Series(['0-10', '11-15', '11-15', '61-65', '46-50'])
age
0 0-10
1 11-15
2 11-15
3 61-65
4 46-50
dtype: object
age.str.split('-')
0 [0, 10]
1 [11, 15]
2 [11, 15]
3 [61, 65]
4 [46, 50]
dtype: object
(age
 .str.split('-', expand=True)
 .iloc[:, 0]
 .astype(int)
)
0 0
1 11
2 11
3 61
4 46
Name: 0, dtype: int64
(age
 .str.slice(-2)
 .astype(int)
)
0 10
1 15
2 15
3 65
4 50
dtype: int64
(age
 .str[-2:]
 .astype(int)
)
0 10
1 15
2 15
3 65
4 50
dtype: int64
(age
 .str.split('-', expand=True)
 .astype(int)
 .mean(axis='columns')
)
0 5.0
1 13.0
2 13.0
3 63.0
4 48.0
dtype: float64
import random
def between(row):
    return random.randint(*row.values)

(age
 .str.split('-', expand=True)
 .astype(int)
 .apply(between, axis='columns')
)
0 2
1 15
2 15
3 62
4 50
dtype: int64
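.apply with axis='columns' calls the Python function once per row. A vectorized alternative (my sketch; it assumes NumPy's Generator API, and the seed is arbitrary) draws every random value in a single call:
# draw all the random ages at once instead of one row at a time
import numpy as np
bounds = age.str.split('-', expand=True).astype(int)
rng = np.random.default_rng(42)
pd.Series(rng.integers(bounds[0], bounds[1], endpoint=True),
          index=age.index)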
make.str.replace('A', 'Å')
0 Ålfa Romeo
1 Ferrari
2 Dodge
3 Dodge
4 Subaru
...
41139 Subaru
41140 Subaru
41141 Subaru
41142 Subaru
41143 Subaru
Name: make, Length: 41144, dtype: object
make.replace('A', 'Å')
0 Alfa Romeo
1 Ferrari
2 Dodge
3 Dodge
4 Subaru
...
41139 Subaru
41140 Subaru
41141 Subaru
41142 Subaru
41143 Subaru
Name: make, Length: 41144, dtype: object
make.replace({'Audi': 'Åudi', 'Acura': 'Åcura',
              'Ashton Martin': 'Åshton Martin',
              'Alfa Romeo': 'Ålfa Romeo'})
0 Ålfa Romeo
1 Ferrari
2 Dodge
3 Dodge
4 Subaru
...
41139 Subaru
41140 Subaru
41141 Subaru
41142 Subaru
41143 Subaru
Name: make, Length: 41144, dtype: object
make.replace('A', 'Å', regex=True)
0 Ålfa Romeo
1 Ferrari
2 Dodge
3 Dodge
4 Subaru
...
41139 Subaru
41140 Subaru
41141 Subaru
41142 Subaru
41143 Subaru
Name: make, Length: 41144, dtype: object
- Using a string column, lowercase the values.
- Using a string column, slice out the first character.
- Using a string column, slice out the last three characters.
- Using a string column, create a series extracting the numeric values.
- Using a string column, create a series extracting the non-ASCII values.
- Using a string column, create a dataframe with the dummy columns for every character in the column.
col = pd.Series(['2015-03-08 08:00:00+00:00',
                 '2015-03-08 08:30:00+00:00',
                 '2015-03-08 09:00:00+00:00',
                 '2015-03-08 09:30:00+00:00',
                 '2015-11-01 06:30:00+00:00',
                 '2015-11-01 07:00:00+00:00',
                 '2015-11-01 07:30:00+00:00',
                 '2015-11-01 08:00:00+00:00',
                 '2015-11-01 08:30:00+00:00',
                 '2015-11-01 08:00:00+00:00',
                 '2015-11-01 08:30:00+00:00',
                 '2015-11-01 09:00:00+00:00',
                 '2015-11-01 09:30:00+00:00',
                 '2015-11-01 10:00:00+00:00'])
utc_s = pd.to_datetime(col, utc=True)
utc_s
0 2015-03-08 08:00:00+00:00
1 2015-03-08 08:30:00+00:00
2 2015-03-08 09:00:00+00:00
3 2015-03-08 09:30:00+00:00
4 2015-11-01 06:30:00+00:00
...
9 2015-11-01 08:00:00+00:00
10 2015-11-01 08:30:00+00:00
11 2015-11-01 09:00:00+00:00
12 2015-11-01 09:30:00+00:00
13 2015-11-01 10:00:00+00:00
Length: 14, dtype: datetime64[ns, UTC]
utc_s.dt.tz_convert('America/Denver')
0 2015-03-08 01:00:00-07:00
1 2015-03-08 01:30:00-07:00
2 2015-03-08 03:00:00-06:00
3 2015-03-08 03:30:00-06:00
4 2015-11-01 00:30:00-06:00
...
9 2015-11-01 01:00:00-07:00
10 2015-11-01 01:30:00-07:00
11 2015-11-01 02:00:00-07:00
12 2015-11-01 02:30:00-07:00
13 2015-11-01 03:00:00-07:00
Length: 14, dtype: datetime64[ns, America/Denver]
s = pd.Series(['2015-03-08 01:00:00-07:00',
               '2015-03-08 01:30:00-07:00',
               '2015-03-08 03:00:00-06:00',
               '2015-03-08 03:30:00-06:00',
               '2015-11-01 00:30:00-06:00',
               '2015-11-01 01:00:00-06:00',
               '2015-11-01 01:30:00-06:00',
               '2015-11-01 01:00:00-07:00',
               '2015-11-01 01:30:00-07:00',
               '2015-11-01 01:00:00-07:00',
               '2015-11-01 01:30:00-07:00',
               '2015-11-01 02:00:00-07:00',
               '2015-11-01 02:30:00-07:00',
               '2015-11-01 03:00:00-07:00'])
pd.to_datetime(s, utc=True).dt.tz_convert('America/Denver')
0 2015-03-08 01:00:00-07:00
1 2015-03-08 01:30:00-07:00
2 2015-03-08 03:00:00-06:00
3 2015-03-08 03:30:00-06:00
4 2015-11-01 00:30:00-06:00
...
9 2015-11-01 01:00:00-07:00
10 2015-11-01 01:30:00-07:00
11 2015-11-01 02:00:00-07:00
12 2015-11-01 02:30:00-07:00
13 2015-11-01 03:00:00-07:00
Length: 14, dtype: datetime64[ns, America/Denver]
time = pd.Series(['2015-03-08 01:00:00',
                  '2015-03-08 01:30:00',
                  '2015-03-08 02:00:00',
                  '2015-03-08 02:30:00',
                  '2015-03-08 03:00:00',
                  '2015-03-08 02:00:00',
                  '2015-03-08 02:30:00',
                  '2015-03-08 03:00:00',
                  '2015-03-08 03:30:00',
                  '2015-11-01 00:30:00',
                  '2015-11-01 01:00:00',
                  '2015-11-01 01:30:00',
                  '2015-11-01 02:00:00',
                  '2015-11-01 02:30:00',
                  '2015-11-01 01:00:00',
                  '2015-11-01 01:30:00',
                  '2015-11-01 02:00:00',
                  '2015-11-01 02:30:00',
                  '2015-11-01 03:00:00'])
offset = pd.Series([-7, -7, -7, -7, -7, -6, -6,
                    -6, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7])
(pd.to_datetime(time)
 .groupby(offset)
 .transform(lambda s: s.dt.tz_localize(s.name)
            .dt.tz_convert('America/Denver'))
)
0 2015-03-07 18:00:07-07:00
1 2015-03-07 18:30:07-07:00
2 2015-03-07 19:00:07-07:00
3 2015-03-07 19:30:07-07:00
4 2015-03-07 20:00:07-07:00
...
14 2015-10-31 19:00:07-06:00
15 2015-10-31 19:30:07-06:00
16 2015-10-31 20:00:07-06:00
17 2015-10-31 20:30:07-06:00
18 2015-10-31 21:00:07-06:00
Length: 19, dtype: datetime64[ns, America/Denver]
offset = offset.replace({-7: '-07:00', -6: '-06:00'})
local = (pd.to_datetime(time)
         .groupby(offset)
         .transform(lambda s: s.dt.tz_localize(s.name)
                    .dt.tz_convert('America/Denver'))
)
local
0 2015-03-08 01:00:00-07:00
1 2015-03-08 01:30:00-07:00
2 2015-03-08 03:00:00-06:00
3 2015-03-08 03:30:00-06:00
4 2015-03-08 04:00:00-06:00
...
14 2015-11-01 01:00:00-07:00
15 2015-11-01 01:30:00-07:00
16 2015-11-01 02:00:00-07:00
17 2015-11-01 02:30:00-07:00
18 2015-11-01 03:00:00-07:00
Length: 19, dtype: datetime64[ns, America/Denver]
local.dt.tz_convert('UTC')
0 2015-03-08 08:00:00+00:00
1 2015-03-08 08:30:00+00:00
2 2015-03-08 09:00:00+00:00
3 2015-03-08 09:30:00+00:00
4 2015-03-08 10:00:00+00:00
...
14 2015-11-01 08:00:00+00:00
15 2015-11-01 08:30:00+00:00
16 2015-11-01 09:00:00+00:00
17 2015-11-01 09:30:00+00:00
18 2015-11-01 10:00:00+00:00
Length: 19, dtype: datetime64[ns, UTC]
secs = local.view(int).floordiv(1e9).astype(int)
secs
0 1425801600
1 1425803400
2 1425805200
3 1425807000
4 1425808800
...
14 1446364800
15 1446366600
16 1446368400
17 1446370200
18 1446372000
Length: 19, dtype: int64
(pd.to_datetime(secs, unit='s')
 .dt.tz_localize('UTC'))
0 2015-03-08 08:00:00+00:00
1 2015-03-08 08:30:00+00:00
2 2015-03-08 09:00:00+00:00
3 2015-03-08 09:30:00+00:00
4 2015-03-08 10:00:00+00:00
...
14 2015-11-01 08:00:00+00:00
15 2015-11-01 08:30:00+00:00
16 2015-11-01 09:00:00+00:00
17 2015-11-01 09:30:00+00:00
18 2015-11-01 10:00:00+00:00
Length: 19, dtype: datetime64[ns, UTC]
url = 'https://github.com/mattharrison/datasets' + \
      '/raw/master/data/alta-noaa-1980-2019.csv'
alta_df = pd.read_csv(url)
dates = pd.to_datetime(alta_df.DATE)
dates
0 1980-01-01
1 1980-01-02
2 1980-01-03
3 1980-01-04
4 1980-01-05
...
14155 2019-09-03
14156 2019-09-04
14157 2019-09-05
14158 2019-09-06
14159 2019-09-07
Name: DATE, Length: 14160, dtype: datetime64[ns]
dates.dt.day_name('es_ES')
0 Martes
1 Miércoles
2 Jueves
3 Viernes
4 Sábado
...
14155 Martes
14156 Miércoles
14157 Jueves
14158 Viernes
14159 Sábado
Name: DATE, Length: 14160, dtype: object
dates.dt.is_month_end
0 False
1 False
2 False
3 False
4 False
...
14155 False
14156 False
14157 False
14158 False
14159 False
Name: DATE, Length: 14160, dtype: bool
dates.dt.strftime('%d/%m/%y')
0 01/01/80
1 02/01/80
2 03/01/80
3 04/01/80
4 05/01/80
...
14155 03/09/19
14156 04/09/19
14157 05/09/19
14158 06/09/19
14159 07/09/19
Name: DATE, Length: 14160, dtype: object
- Convert a column with date information to a date.
- Convert a date column into UTC dates.
- Convert a date column into local dates with a timezone.
- Convert a date column into epoch values.
- Convert an epoch number into UTC.
url = 'https://github.com/mattharrison/datasets' + \
      '/raw/master/data/alta-noaa-1980-2019.csv'
alta_df = pd.read_csv(url)
dates = pd.to_datetime(alta_df.DATE)
snow = (alta_df
        .SNOW
        .rename(dates)
)
snow
1980-01-01 2.0
1980-01-02 3.0
1980-01-03 1.0
1980-01-04 0.0
1980-01-05 0.0
...
2019-09-03 0.0
2019-09-04 0.0
2019-09-05 0.0
2019-09-06 0.0
2019-09-07 0.0
Name: SNOW, Length: 14160, dtype: float64
snow.isna().any()
True
snow[snow.isna()]
1985-07-30 NaN
1985-09-12 NaN
1985-09-19 NaN
1986-02-07 NaN
1986-06-26 NaN
..
2017-04-26 NaN
2017-09-20 NaN
2017-10-02 NaN
2017-12-23 NaN
2018-12-03 NaN
Name: SNOW, Length: 365, dtype: float64
snow.loc['1985-09':'1985-09-20']
1985-09-01 0.0
1985-09-02 0.0
1985-09-03 0.0
1985-09-04 0.0
1985-09-05 0.0
...
1985-09-16 0.0
1985-09-17 0.0
1985-09-18 0.0
1985-09-19 NaN
1985-09-20 0.0
Name: SNOW, Length: 20, dtype: float64
(snow
 .loc['1985-09':'1985-09-20']
 .fillna(0)
)
1985-09-01 0.0
1985-09-02 0.0
1985-09-03 0.0
1985-09-04 0.0
1985-09-05 0.0
...
1985-09-16 0.0
1985-09-17 0.0
1985-09-18 0.0
1985-09-19 0.0
1985-09-20 0.0
Name: SNOW, Length: 20, dtype: float64
snow.loc['1987-12-30':'1988-01-10']
1987-12-30 6.0
1987-12-31 5.0
1988-01-01 NaN
1988-01-02 0.0
1988-01-03 0.0
...
1988-01-06 6.0
1988-01-07 4.0
1988-01-08 9.0
1988-01-09 5.0
1988-01-10 2.0
Name: SNOW, Length: 12, dtype: float64
(snow
 .loc['1987-12-30':'1988-01-10']
 .ffill()
)
1987-12-30 6.0
1987-12-31 5.0
1988-01-01 5.0
1988-01-02 0.0
1988-01-03 0.0
...
1988-01-06 6.0
1988-01-07 4.0
1988-01-08 9.0
1988-01-09 5.0
1988-01-10 2.0
Name: SNOW, Length: 12, dtype: float64
(snow
 .loc['1987-12-30':'1988-01-10']
 .bfill()
)
1987-12-30 6.0
1987-12-31 5.0
1988-01-01 0.0
1988-01-02 0.0
1988-01-03 0.0
...
1988-01-06 6.0
1988-01-07 4.0
1988-01-08 9.0
1988-01-09 5.0
1988-01-10 2.0
Name: SNOW, Length: 12, dtype: float64
(snow
 .loc['1987-12-30':'1988-01-10']
 .interpolate()
)
1987-12-30 6.0
1987-12-31 5.0
1988-01-01 2.5
1988-01-02 0.0
1988-01-03 0.0
...
1988-01-06 6.0
1988-01-07 4.0
1988-01-08 9.0
1988-01-09 5.0
1988-01-10 2.0
Name: SNOW, Length: 12, dtype: float64
winter = (snow.index.quarter == 1) | (snow.index.quarter == 4)

(snow
 .where(~(winter & snow.isna()), snow.interpolate())
 .where(~(~winter & snow.isna()), 0)
)
1980-01-01 2.0
1980-01-02 3.0
1980-01-03 1.0
1980-01-04 0.0
1980-01-05 0.0
...
2019-09-03 0.0
2019-09-04 0.0
2019-09-05 0.0
2019-09-06 0.0
2019-09-07 0.0
Name: SNOW, Length: 14160, dtype: float64
(snow
 .where(~(winter & snow.isna()), snow.interpolate())
 .where(~(~winter & snow.isna()), 0)
 .loc[['1985-09-19', '1988-01-01']]
)
1985-09-19 0.0
1988-01-01 2.5
Name: SNOW, dtype: float64
(snow
 .loc['1987-12-30':'1988-01-10']
 .dropna()
)
1987-12-30 6.0
1987-12-31 5.0
1988-01-02 0.0
1988-01-03 0.0
1988-01-05 2.0
1988-01-06 6.0
1988-01-07 4.0
1988-01-08 9.0
1988-01-09 5.0
1988-01-10 2.0
Name: SNOW, dtype: float64
snow.shift(1)
1980-01-01 NaN
1980-01-02 2.0
1980-01-03 3.0
1980-01-04 1.0
1980-01-05 0.0
...
2019-09-03 0.0
2019-09-04 0.0
2019-09-05 0.0
2019-09-06 0.0
2019-09-07 0.0
Name: SNOW, Length: 14160, dtype: float64
snow.shift(-1)
1980-01-01 3.0
1980-01-02 1.0
1980-01-03 0.0
1980-01-04 0.0
1980-01-05 1.0
...
2019-09-03 0.0
2019-09-04 0.0
2019-09-05 0.0
2019-09-06 0.0
2019-09-07 NaN
Name: SNOW, Length: 14160, dtype: float64
(snow
 .add(snow.shift(1))
 .add(snow.shift(2))
 .add(snow.shift(3))
 .add(snow.shift(4))
 .div(5)
)
1980-01-01 NaN
1980-01-02 NaN
1980-01-03 NaN
1980-01-04 NaN
1980-01-05 1.2
...
2019-09-03 0.0
2019-09-04 0.0
2019-09-05 0.0
2019-09-06 0.0
2019-09-07 0.0
Name: SNOW, Length: 14160, dtype: float64
(snow
 .rolling(5)
 .mean()
)
1980-01-01 NaN
1980-01-02 NaN
1980-01-03 NaN
1980-01-04 NaN
1980-01-05 1.2
...
2019-09-03 0.0
2019-09-04 0.0
2019-09-05 0.0
2019-09-06 0.0
2019-09-07 0.0
Name: SNOW, Length: 14160, dtype: float64
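The leading NaN values appear because .rolling requires a full window of five entries by default. min_periods (a standard rolling parameter, though not used in the original) relaxes that:
# allow partial windows at the start instead of producing NaN
(snow
 .rolling(5, min_periods=1)
 .mean()
)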
(snow
 .resample('M')
 .max()
)
1980-01-31 20.0
1980-02-29 25.0
1980-03-31 16.0
1980-04-30 10.0
1980-05-31 9.0
...
2019-05-31 5.1
2019-06-30 0.0
2019-07-31 0.0
2019-08-31 0.0
2019-09-30 0.0
Freq: M, Name: SNOW, Length: 477, dtype: float64
(snow
 .resample('2M')
 .max()
)
1980-01-31 20.0
1980-03-31 25.0
1980-05-31 10.0
1980-07-31 1.0
1980-09-30 0.0
...
2019-01-31 19.0
2019-03-31 20.7
2019-05-31 18.0
2019-07-31 0.0
2019-09-30 0.0
Freq: 2M, Name: SNOW, Length: 239, dtype: float64
(snow
 .resample('A-MAY')
 .max()
)
1980-05-31 25.0
1981-05-31 26.0
1982-05-31 34.0
1983-05-31 38.0
1984-05-31 25.0
...
2016-05-31 15.0
2017-05-31 26.0
2018-05-31 21.8
2019-05-31 20.7
2020-05-31 0.0
Freq: A-MAY, Name: SNOW, Length: 41, dtype: float64
(snow
 .div(snow
      .resample('Q')
      .transform('sum'))
 .mul(100)
 .fillna(0)
)
1980-01-01 0.527009
1980-01-02 0.790514
1980-01-03 0.263505
1980-01-04 0.000000
1980-01-05 0.000000
...
2019-09-03 0.000000
2019-09-04 0.000000
2019-09-05 0.000000
2019-09-06 0.000000
2019-09-07 0.000000
Name: SNOW, Length: 14160, dtype: float64
season2017 = snow.loc['2016-10':'2017-05']

(season2017
 .resample('M')
 .sum()
 .div(season2017.sum())
 .mul(100)
)
2016-10-31 2.153969
2016-11-30 9.772637
2016-12-31 15.715995
2017-01-31 25.468688
2017-02-28 21.041085
2017-03-31 9.274033
2017-04-30 14.738732
2017-05-31 1.834862
Freq: M, Name: SNOW, dtype: float64
def season(idx):
    year = idx.year
    month = idx.month
    return year.where((month < 10), year + 1)

(snow
 .groupby(season)
 .sum()
)
1980 457.5
1981 503.0
1982 842.5
1983 807.5
1984 816.0
...
2015 284.3
2016 354.6
2017 524.0
2018 308.8
2019 504.5
Name: SNOW, Length: 40, dtype: float64
(snow
 .resample('A-SEP')
 .sum()
)
1980-09-30 457.5
1981-09-30 503.0
1982-09-30 842.5
1983-09-30 807.5
1984-09-30 816.0
...
2015-09-30 284.3
2016-09-30 354.6
2017-09-30 524.0
2018-09-30 308.8
2019-09-30 504.5
Freq: A-SEP, Name: SNOW, Length: 40, dtype: float64
(snow
 .loc['2016-10':'2017-09']
 .cumsum()
)
2016-10-01 0.0
2016-10-02 0.0
2016-10-03 4.9
2016-10-04 4.9
2016-10-05 5.5
...
2017-09-26 524.0
2017-09-27 524.0
2017-09-28 524.0
2017-09-29 524.0
2017-09-30 524.0
Name: SNOW, Length: 364, dtype: float64
(snow
 .resample('A-SEP')
 .transform('cumsum')
)
1980-01-01 2.0
1980-01-02 5.0
1980-01-03 6.0
1980-01-04 6.0
1980-01-05 6.0
...
2019-09-03 504.5
2019-09-04 504.5
2019-09-05 504.5
2019-09-06 504.5
2019-09-07 504.5
Name: SNOW, Length: 14160, dtype: float64
- Convert a column with date information to a date.
- Put the date information into the index for a numeric column.
- Calculate the average value of the column for each month.
- Calculate the average value of the column for every 2 months.
- Calculate the percentage of the column out of the total for each month.
- Calculate the average value of the column for a rolling window of size 7.
- Using .loc, pull out the first 3 months of a year.
- Using .loc, pull out the last 4 months of a year.
url = 'https://github.com/mattharrison/datasets/raw/master/' \
      'data/alta-noaa-1980-2019.csv'
alta_df = pd.read_csv(url)  # doctest: +SKIP
dates = pd.to_datetime(alta_df.DATE)
snow = (alta_df
        .SNOW
        .rename(dates)
)
snow
1980-01-01 2.0
1980-01-02 3.0
1980-01-03 1.0
1980-01-04 0.0
1980-01-05 0.0
...
2019-09-03 0.0
2019-09-04 0.0
2019-09-05 0.0
2019-09-06 0.0
2019-09-07 0.0
Name: SNOW, Length: 14160, dtype: float64
snow.plot.hist()  # doctest: +SKIP
snow[snow > 0].plot.hist(bins=20, title='Snowfall Histogram (in)')  # doctest: +SKIP
snow.plot.box()
(snow
 [lambda s: (s.index.month == 1) & (s > 0)]
 .plot.box()
)
(snow
 [lambda s: (s.index.month == 1) & (s > 0)]
 .plot.kde()
)
snow.plot.line()
(snow
 .iloc[-300:]
 .plot.line()
)
(snow
 .resample('M')
 .mean()
 .plot.line()
)
(snow
 .resample('Q')
 .quantile([.5, .9, .99])
 .unstack()
 .iloc[-100:]
 .plot.line()
)
season2017 = snow.loc['2016-10':'2017-05']

(season2017
 .resample('M')
 .sum()
 .div(season2017.sum())
 .mul(100)
 .rename(lambda idx: idx.month_name())
)
October 2.153969
November 9.772637
December 15.715995
January 25.468688
February 21.041085
March 9.274033
April 14.738732
May 1.834862
Name: SNOW, dtype: float64
(season2017
 .resample('M')
 .sum()
 .div(season2017.sum())
 .mul(100)
 .rename(lambda idx: idx.month_name())
 .plot.bar(title='2017 Monthly Percent of Snowfall')
)
(season2017
 .resample('M')
 .sum()
 .div(season2017.sum())
 .mul(100)
 .rename(lambda idx: idx.month_name())
 .plot.barh(title='2017 Monthly Percent of Snowfall')
)
url = 'https://github.com/mattharrison/datasets/raw/master/data/' \
      'vehicles.csv.zip'
df = pd.read_csv(url)  # doctest: +SKIP
make = df.make
/home/matt/envs/menv/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3418: DtypeWarning: Columns (68,70,71,72,73,74,76,79) have mixed types.Specify dtype option on import or set low_memory=False.
exec(code_obj, self.user_global_ns, self.user_ns)
make.value_counts()
Chevrolet 4003
Ford 3371
Dodge 2583
GMC 2494
Toyota 2071
...
Volga Associated Automobile 1
Panos 1
Mahindra 1
Excalibur Autos 1
London Coach Co Inc 1
Name: make, Length: 136, dtype: int64
(make
.value_counts()
 .plot.bar()
)
top10 = make.value_counts().index[:10]

(make
 .where(make.isin(top10), 'Other')
 .value_counts()
 .plot.barh()
)
(season2017
 .resample('M')
 .sum()
 .div(season2017.sum())
 .mul(100)
 .rename(lambda idx: idx.month_name())
 .plot.pie(title='2017 Monthly Percent of Snowfall')
)
import pandas as pd
url = 'https://github.com/mattharrison/datasets/raw/master/' \
      'data/vehicles.csv.zip'
df = pd.read_csv(url)  # doctest: +SKIP
make = df.make
make
/home/matt/envs/menv/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3418: DtypeWarning: Columns (68,70,71,72,73,74,76,79) have mixed types.Specify dtype option on import or set low_memory=False.
exec(code_obj, self.user_global_ns, self.user_ns)
0 Alfa Romeo
1 Ferrari
2 Dodge
3 Dodge
4 Subaru
...
41139 Subaru
41140 Subaru
41141 Subaru
41142 Subaru
41143 Subaru
Name: make, Length: 41144, dtype: object
make.value_counts()
Chevrolet 4003
Ford 3371
Dodge 2583
GMC 2494
Toyota 2071
...
Volga Associated Automobile 1
Panos 1
Mahindra 1
Excalibur Autos 1
London Coach Co Inc 1
Name: make, Length: 136, dtype: int64
make.shape, make.nunique()
((41144,), 136)
cat_make = make.astype('category')
make.memory_usage(deep=True)
2606395
cat_make.memory_usage(deep=True)
95888
make_type = pd.CategoricalDtype(
    categories=sorted(make.unique()), ordered=True)
ordered_make = make.astype(make_type)
ordered_make
0 Alfa Romeo
1 Ferrari
2 Dodge
3 Dodge
4 Subaru
...
41139 Subaru
41140 Subaru
41141 Subaru
41142 Subaru
41143 Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General' < 'ASC Incorporated' < 'Acura' < 'Alfa Romeo' ... 'Volvo' < 'Wallace Environmental' < 'Yugo' < 'smart']
ordered_make.max()
'smart'
cat_make.max()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
...
TypeError: Categorical is not ordered for operation max
you can use .as_ordered() to change the Categorical to an ordered one
ordered_make.sort_values()
20288 AM General
20289 AM General
369 AM General
358 AM General
19314 AM General
...
31289 smart
31290 smart
29605 smart
22974 smart
26882 smart
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General' < 'ASC Incorporated' < 'Acura' < 'Alfa Romeo' ... 'Volvo' < 'Wallace Environmental' < 'Yugo' < 'smart']
cat_make.cat.rename_categories(
    [c.lower() for c in cat_make.cat.categories])
0 alfa romeo
1 ferrari
2 dodge
3 dodge
4 subaru
...
41139 subaru
41140 subaru
41141 subaru
41142 subaru
41143 subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['am general', 'asc incorporated', 'acura', 'alfa romeo', ..., 'volvo', 'wallace environmental', 'yugo', 'smart']
ordered_make.cat.rename_categories(
    {c: c.lower() for c in ordered_make.cat.categories})
0 alfa romeo
1 ferrari
2 dodge
3 dodge
4 subaru
...
41139 subaru
41140 subaru
41141 subaru
41142 subaru
41143 subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['am general' < 'asc incorporated' < 'acura' < 'alfa romeo' ... 'volvo' < 'wallace environmental' < 'yugo' < 'smart']
ordered_make.cat.reorder_categories(
    sorted(cat_make.cat.categories, key=str.lower))
0 Alfa Romeo
1 Ferrari
2 Dodge
3 Dodge
4 Subaru
...
41139 Subaru
41140 Subaru
41141 Subaru
41142 Subaru
41143 Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['Acura' < 'Alfa Romeo' < 'AM General' < 'American Motors Corporation' ... 'Volvo' < 'VPG' < 'Wallace Environmental' < 'Yugo']
ordered_make.iloc[:100].value_counts()
Dodge 17
Oldsmobile 8
Ford 8
Buick 7
Chevrolet 5
..
Grumman Allied Industries 0
Goldacre 0
Geo 0
Genesis 0
smart 0
Name: make, Length: 136, dtype: int64
(cat_make
 .iloc[:100]
 .groupby(cat_make.iloc[:100])
 .first()
)
make
AM General NaN
ASC Incorporated NaN
Acura NaN
Alfa Romeo Alfa Romeo
American Motors Corporation NaN
...
Volkswagen Volkswagen
Volvo Volvo
Wallace Environmental NaN
Yugo NaN
smart NaN
Name: make, Length: 136, dtype: category
Categories (136, object): ['AM General', 'ASC Incorporated', 'Acura', 'Alfa Romeo', ..., 'Volvo', 'Wallace Environmental', 'Yugo', 'smart']
(make
 .iloc[:100]
 .groupby(make.iloc[:100])
 .first()
)
make
Alfa Romeo Alfa Romeo
Audi Audi
BMW BMW
Buick Buick
CX Automotive CX Automotive
...
Rolls-Royce Rolls-Royce
Subaru Subaru
Toyota Toyota
Volkswagen Volkswagen
Volvo Volvo
Name: make, Length: 25, dtype: object
(cat_make
 .iloc[:100]
 .groupby(cat_make.iloc[:100], observed=True)
 .first()
)
make
Alfa Romeo Alfa Romeo
Ferrari Ferrari
Dodge Dodge
Subaru Subaru
Toyota Toyota
...
Mazda Mazda
Oldsmobile Oldsmobile
Plymouth Plymouth
Pontiac Pontiac
Rolls-Royce Rolls-Royce
Name: make, Length: 25, dtype: category
Categories (136, object): ['AM General', 'ASC Incorporated', 'Acura', 'Alfa Romeo', ..., 'Volvo', 'Wallace Environmental', 'Yugo', 'smart']
ordered_make.iloc[0]
'Alfa Romeo'
ordered_make.iloc[[0]]
0 Alfa Romeo
Name: make, dtype: category
Categories (136, object): ['AM General' < 'ASC Incorporated' < 'Acura' < 'Alfa Romeo' ... 'Volvo' < 'Wallace Environmental' < 'Yugo' < 'smart']
def generalize_topn(ser, n=5, other='Other'):
    topn = ser.value_counts().index[:n]
    if isinstance(ser.dtype, pd.CategoricalDtype):
        ser = ser.cat.set_categories(
            topn.set_categories(list(topn) + [other]))
    return ser.where(ser.isin(topn), other)

cat_make.pipe(generalize_topn, n=20, other='NA')
0 NA
1 NA
2 Dodge
3 Dodge
4 Subaru
...
41139 Subaru
41140 Subaru
41141 Subaru
41142 Subaru
41143 Subaru
Name: make, Length: 41144, dtype: category
Categories (21, object): ['Chevrolet', 'Ford', 'Dodge', 'GMC', ..., 'Volvo', 'Hyundai', 'Chrysler', 'NA']
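Because generalize_topn checks the dtype before reaching for the .cat accessor, the same helper should also work on the plain object column; a usage sketch (my addition):
# non-categorical input skips the category bookkeeping and takes the plain .where path
make.pipe(generalize_topn, n=20, other='NA')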
def generalize_mapping(ser, mapping, default):
    seen = None
    res = ser.astype(str)
    for old, new in mapping.items():
        mask = ser.str.contains(old)
        if seen is None:
            seen = mask
        else:
            seen |= mask
        res = res.where(~mask, new)
    res = res.where(seen, default)
    return res.astype('category')

generalize_mapping(cat_make, {'Ford': 'US', 'Tesla': 'US',
                              'Chevrolet': 'US', 'Dodge': 'US',
                              'Oldsmobile': 'US', 'Plymouth': 'US',
                              'BMW': 'German'}, 'Other')
0 Other
1 Other
2 US
3 US
4 Other
...
41139 Other
41140 Other
41141 Other
41142 Other
41143 Other
Name: make, Length: 41144, dtype: category
Categories (3, object): ['German', 'Other', 'US']