import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from pyhere import here
from palmerpenguins import load_penguins
7 Pandas Data Frame (filter)
= pd.read_csv(here("data/titanic/titanic.csv"))
titanic = load_penguins() penguins
7.1 Filter rows with Regex
= pd.DataFrame({"First": ["Mary", "James"], "Sur": ["Pop", "Bond"]}) df
'First'].str.contains('Ma', regex=True)] df[df[
First | Sur | |
---|---|---|
0 | Mary | Pop |
'First'].str.contains('ma', case=False)] df[df[
First | Sur | |
---|---|---|
0 | Mary | Pop |
7.2 Filter Multiple Columes
7.2.1 Manual Method
= pd.DataFrame({"First": ["Mary", "James"], "Sur": ["Pop", "Bond"]})
df
= "Po"
query_text
'First'].str.contains(query_text, case=False) | df['Sur'].str.contains(query_text, case=False)] df[df[
First | Sur | |
---|---|---|
0 | Mary | Pop |
7.2.2 Automatic method
# Create the DataFrame
= pd.DataFrame({"First": ["Mary", "James"], "Sur": ["Pop", "Bond"]})
df
# Define the query text
= "Po"
query_text
# Create a mask for each column and combine them using 'any' along the columns
= df.apply(lambda column: column.str.contains(query_text, case=False)).any(axis=1)
mask
# Filter the DataFrame using the combined mask
= df[mask]
filtered_df filtered_df
First | Sur | |
---|---|---|
0 | Mary | Pop |
def query_any_column_df(df, query_text: str, case=False):
import pandas as pd
# Create a mask for each column and combine them using 'any' along the columns
= df.apply(lambda column: column.str.contains(query_text, case=case)).any(axis=1)
mask
# Filter the DataFrame using the combined mask
= df[mask]
filtered_df return filtered_df
"Ma") query_any_column_df(df,
First | Sur | |
---|---|---|
0 | Mary | Pop |
= query_any_column_df(df, "0") df_zero
df_zero.shape
(0, 2)
"species", "island", "sex"]], "Chin") query_any_column_df(penguins[[
species | island | sex | |
---|---|---|---|
276 | Chinstrap | Dream | female |
277 | Chinstrap | Dream | male |
278 | Chinstrap | Dream | male |
279 | Chinstrap | Dream | female |
280 | Chinstrap | Dream | male |
... | ... | ... | ... |
339 | Chinstrap | Dream | male |
340 | Chinstrap | Dream | female |
341 | Chinstrap | Dream | male |
342 | Chinstrap | Dream | male |
343 | Chinstrap | Dream | female |
68 rows × 3 columns
apply(lambda column: column.str.contains(query_text, case=False)) df.
First | Sur | |
---|---|---|
0 | False | True |
1 | False | False |
7.3 Case When: .case_when()
import pandas as pd
# Create a sample Series
= pd.Series([10, 25, 3, 15, 8, 30])
s
# Apply case_when to categorize values
= s.case_when(
result =[
caselist< 10, "Small"),
(s 10, 20), "Medium"),
(s.between(> 20, "Large")
(s
]
)
print(result)
0 Medium
1 Large
2 Small
3 Medium
4 Small
5 Large
dtype: object