import pandas as pd
import numpy as np
from pandas import Series, DataFrame
See Comparison with R data.frame
it can be thought of as a dictionary of Series all sharing the same index.
Creation
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
"year": [2000, 2001, 2002, 2001, 2002, 2003],
"pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame
| 0 |
Ohio |
2000 |
1.5 |
| 1 |
Ohio |
2001 |
1.7 |
| 2 |
Ohio |
2002 |
3.6 |
| 3 |
Nevada |
2001 |
2.4 |
| 4 |
Nevada |
2002 |
2.9 |
| 5 |
Nevada |
2003 |
3.2 |
0 Ohio
1 Ohio
2 Ohio
3 Nevada
4 Nevada
5 Nevada
Name: state, dtype: object
# With specified column arrangement
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
frame2
| 0 |
2000 |
Ohio |
1.5 |
NaN |
| 1 |
2001 |
Ohio |
1.7 |
NaN |
| 2 |
2002 |
Ohio |
3.6 |
NaN |
| 3 |
2001 |
Nevada |
2.4 |
NaN |
| 4 |
2002 |
Nevada |
2.9 |
NaN |
| 5 |
2003 |
Nevada |
3.2 |
NaN |
Index(['year', 'state', 'pop', 'debt'], dtype='object')
# Shape
frame.shape
# Head
frame.head()
# Tail
frame.tail(3)
| 3 |
Nevada |
2001 |
2.4 |
| 4 |
Nevada |
2002 |
2.9 |
| 5 |
Nevada |
2003 |
3.2 |
Nested dictionary
populations = {"Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
"Nevada": {2001: 2.4, 2002: 2.9}}
pd.DataFrame(populations)
| 2000 |
1.5 |
NaN |
| 2001 |
1.7 |
2.4 |
| 2002 |
3.6 |
2.9 |
Rows & Colums
Index(['state', 'year', 'pop'], dtype='object')
Select Column
# Quote or unquote
frame2["state"]
frame2["state"] is frame2.state
# Select multiple column # type: ignore
frame2sub = frame2[["state", "pop"]]
frame2sub
| 0 |
Ohio |
1.5 |
| 1 |
Ohio |
1.7 |
| 2 |
Ohio |
3.6 |
| 3 |
Nevada |
2.4 |
| 4 |
Nevada |
2.9 |
| 5 |
Nevada |
3.2 |
Slice
.iloc: integer location
frame.iloc[:3] # First 3 rows
| 0 |
Ohio |
2000 |
1.5 |
| 1 |
Ohio |
2001 |
1.7 |
| 2 |
Ohio |
2002 |
3.6 |
# Rows, Colums
frame.iloc[[0,1], [0, 2]]
.loc: Location by Labels
frame.loc[[0, 1], ["state", "year"]]
frame.loc[0:2, "state":"pop"]
| 0 |
Ohio |
2000 |
1.5 |
| 1 |
Ohio |
2001 |
1.7 |
| 2 |
Ohio |
2002 |
3.6 |
Filter Rows
Using query()
frame2[frame2.state == "Ohio"]
# Or
frame2.query('state == "Ohio"')
| 0 |
2000 |
Ohio |
1.5 |
NaN |
| 1 |
2001 |
Ohio |
1.7 |
NaN |
| 2 |
2002 |
Ohio |
3.6 |
NaN |
frame2.query('pop > 1.5')
| 1 |
2001 |
Ohio |
1.7 |
NaN |
| 2 |
2002 |
Ohio |
3.6 |
NaN |
| 3 |
2001 |
Nevada |
2.4 |
NaN |
| 4 |
2002 |
Nevada |
2.9 |
NaN |
| 5 |
2003 |
Nevada |
3.2 |
NaN |
Mutate (assign)
R: mutate(df, c=a-b)
Py: df.assign(c=df['a']-df['b'])
frame2.assign(pop2 = frame2["pop"] * 2)
| 0 |
2000 |
Ohio |
1.5 |
NaN |
3.0 |
| 1 |
2001 |
Ohio |
1.7 |
NaN |
3.4 |
| 2 |
2002 |
Ohio |
3.6 |
NaN |
7.2 |
| 3 |
2001 |
Nevada |
2.4 |
NaN |
4.8 |
| 4 |
2002 |
Nevada |
2.9 |
NaN |
5.8 |
| 5 |
2003 |
Nevada |
3.2 |
NaN |
6.4 |
frame2["debt"] = np.arange(6)
frame2
| 0 |
2000 |
Ohio |
1.5 |
0 |
| 1 |
2001 |
Ohio |
1.7 |
1 |
| 2 |
2002 |
Ohio |
3.6 |
2 |
| 3 |
2001 |
Nevada |
2.4 |
3 |
| 4 |
2002 |
Nevada |
2.9 |
4 |
| 5 |
2003 |
Nevada |
3.2 |
5 |
Rename
rename(df, col_one = col1)
df.rename(columns={'col1': 'col_one'})
frame2.rename(columns = {'year': 'yr'})
| 0 |
2000 |
Ohio |
1.5 |
0 |
| 1 |
2001 |
Ohio |
1.7 |
1 |
| 2 |
2002 |
Ohio |
3.6 |
2 |
| 3 |
2001 |
Nevada |
2.4 |
3 |
| 4 |
2002 |
Nevada |
2.9 |
4 |
| 5 |
2003 |
Nevada |
3.2 |
5 |
Summarize
Describe
| count |
6.000000 |
6.000000 |
6.000000 |
| mean |
2001.500000 |
2.550000 |
2.500000 |
| std |
1.048809 |
0.836062 |
1.870829 |
| min |
2000.000000 |
1.500000 |
0.000000 |
| 25% |
2001.000000 |
1.875000 |
1.250000 |
| 50% |
2001.500000 |
2.650000 |
2.500000 |
| 75% |
2002.000000 |
3.125000 |
3.750000 |
| max |
2003.000000 |
3.600000 |
5.000000 |
Group by, summarize
frame2.groupby('state').agg({'pop': 'mean'})
| state |
|
| Nevada |
2.833333 |
| Ohio |
2.266667 |
Count
frame2.value_counts("year")
year
2001 2
2002 2
2000 1
2003 1
Name: count, dtype: int64
frame2.value_counts(["year", "state"], sort=True)
year state
2000 Ohio 1
2001 Nevada 1
Ohio 1
2002 Nevada 1
Ohio 1
2003 Nevada 1
Name: count, dtype: int64