6  Pandas Data Frame

import pandas as pd
import numpy as np
from pandas import Series, DataFrame

See Comparison with R data.frame

it can be thought of as a dictionary of Series all sharing the same index.

6.1 Creation

data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
        
frame = pd.DataFrame(data)
frame
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
5 Nevada 2003 3.2
frame["state"]
0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object
# With specified column arrangement 
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
frame2
year state pop debt
0 2000 Ohio 1.5 NaN
1 2001 Ohio 1.7 NaN
2 2002 Ohio 3.6 NaN
3 2001 Nevada 2.4 NaN
4 2002 Nevada 2.9 NaN
5 2003 Nevada 3.2 NaN
frame2.columns
Index(['year', 'state', 'pop', 'debt'], dtype='object')
# Shape
frame.shape
# Head
frame.head()
# Tail
frame.tail(3)
state year pop
3 Nevada 2001 2.4
4 Nevada 2002 2.9
5 Nevada 2003 3.2

Nested dictionary

populations = {"Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
               "Nevada": {2001: 2.4, 2002: 2.9}}

pd.DataFrame(populations)
Ohio Nevada
2000 1.5 NaN
2001 1.7 2.4
2002 3.6 2.9

6.2 Rows & Colums

frame.shape
(6, 3)
frame.columns
Index(['state', 'year', 'pop'], dtype='object')

6.3 Select Column

# Quote or unquote
frame2["state"]

frame2["state"] is frame2.state
True
# Select multiple column # type: ignore
frame2sub = frame2[["state", "pop"]]
frame2sub
state pop
0 Ohio 1.5
1 Ohio 1.7
2 Ohio 3.6
3 Nevada 2.4
4 Nevada 2.9
5 Nevada 3.2

6.4 Slice

6.4.1 .iloc: integer location

frame.iloc[:3] # First 3 rows
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
# Rows, Colums
frame.iloc[[0,1], [0, 2]]
state pop
0 Ohio 1.5
1 Ohio 1.7

6.4.2 .loc: Location by Labels

frame.loc[[0, 1], ["state", "year"]]
state year
0 Ohio 2000
1 Ohio 2001
frame.loc[0:2, "state":"pop"]
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6

6.5 Filter Rows

6.5.1 Using query()

frame2[frame2.state == "Ohio"]
# Or
frame2.query('state == "Ohio"')
year state pop debt
0 2000 Ohio 1.5 NaN
1 2001 Ohio 1.7 NaN
2 2002 Ohio 3.6 NaN
frame2.query('pop > 1.5')
year state pop debt
1 2001 Ohio 1.7 NaN
2 2002 Ohio 3.6 NaN
3 2001 Nevada 2.4 NaN
4 2002 Nevada 2.9 NaN
5 2003 Nevada 3.2 NaN

6.6 Mutate (assign)

R: mutate(df, c=a-b)

Py: df.assign(c=df['a']-df['b'])

frame2.assign(pop2 = frame2["pop"] * 2)
year state pop debt pop2
0 2000 Ohio 1.5 NaN 3.0
1 2001 Ohio 1.7 NaN 3.4
2 2002 Ohio 3.6 NaN 7.2
3 2001 Nevada 2.4 NaN 4.8
4 2002 Nevada 2.9 NaN 5.8
5 2003 Nevada 3.2 NaN 6.4
frame2["debt"] = np.arange(6)
frame2
year state pop debt
0 2000 Ohio 1.5 0
1 2001 Ohio 1.7 1
2 2002 Ohio 3.6 2
3 2001 Nevada 2.4 3
4 2002 Nevada 2.9 4
5 2003 Nevada 3.2 5

6.6.1 Rename

rename(df, col_one = col1)
df.rename(columns={'col1': 'col_one'})
frame2.rename(columns = {'year': 'yr'})
yr state pop debt
0 2000 Ohio 1.5 0
1 2001 Ohio 1.7 1
2 2002 Ohio 3.6 2
3 2001 Nevada 2.4 3
4 2002 Nevada 2.9 4
5 2003 Nevada 3.2 5

6.7 Summarize

6.7.1 Describe

frame2.describe()
year pop debt
count 6.000000 6.000000 6.000000
mean 2001.500000 2.550000 2.500000
std 1.048809 0.836062 1.870829
min 2000.000000 1.500000 0.000000
25% 2001.000000 1.875000 1.250000
50% 2001.500000 2.650000 2.500000
75% 2002.000000 3.125000 3.750000
max 2003.000000 3.600000 5.000000

6.7.2 Group by, summarize

frame2.groupby('state').agg({'pop': 'mean'})
pop
state
Nevada 2.833333
Ohio 2.266667

6.8 Count

frame2.value_counts("year")
year
2001    2
2002    2
2000    1
2003    1
Name: count, dtype: int64
frame2.value_counts(["year", "state"], sort=True)
year  state 
2000  Ohio      1
2001  Nevada    1
      Ohio      1
2002  Nevada    1
      Ohio      1
2003  Nevada    1
Name: count, dtype: int64