7  Pandas Data Frame (filter)

import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from pyhere import here
from palmerpenguins import load_penguins
titanic = pd.read_csv(here("data/titanic/titanic.csv"))
penguins = load_penguins()

7.1 Filter rows with Regex

df = pd.DataFrame({"First": ["Mary", "James"], "Sur": ["Pop", "Bond"]})
df[df['First'].str.contains('Ma', regex=True)]
First Sur
0 Mary Pop
df[df['First'].str.contains('ma', case=False)]
First Sur
0 Mary Pop

7.2 Filter Multiple Columes

7.2.1 Manual Method

df = pd.DataFrame({"First": ["Mary", "James"], "Sur": ["Pop", "Bond"]})

query_text = "Po"

df[df['First'].str.contains(query_text, case=False) | df['Sur'].str.contains(query_text, case=False)]
First Sur
0 Mary Pop

7.2.2 Automatic method

# Create the DataFrame
df = pd.DataFrame({"First": ["Mary", "James"], "Sur": ["Pop", "Bond"]})

# Define the query text
query_text = "Po"

# Create a mask for each column and combine them using 'any' along the columns
mask = df.apply(lambda column: column.str.contains(query_text, case=False)).any(axis=1)

# Filter the DataFrame using the combined mask
filtered_df = df[mask]
filtered_df
First Sur
0 Mary Pop
def query_any_column_df(df, query_text: str, case=False):
    import pandas as pd
    # Create a mask for each column and combine them using 'any' along the columns
    mask = df.apply(lambda column: column.str.contains(query_text, case=case)).any(axis=1)

    # Filter the DataFrame using the combined mask
    filtered_df = df[mask]
    return filtered_df
query_any_column_df(df, "Ma")
First Sur
0 Mary Pop
df_zero = query_any_column_df(df, "0")
df_zero.shape
(0, 2)
query_any_column_df(penguins[["species", "island", "sex"]], "Chin")
species island sex
276 Chinstrap Dream female
277 Chinstrap Dream male
278 Chinstrap Dream male
279 Chinstrap Dream female
280 Chinstrap Dream male
... ... ... ...
339 Chinstrap Dream male
340 Chinstrap Dream female
341 Chinstrap Dream male
342 Chinstrap Dream male
343 Chinstrap Dream female

68 rows × 3 columns

df.apply(lambda column: column.str.contains(query_text, case=False))
First Sur
0 False True
1 False False

7.3 Case When: .case_when()

import pandas as pd

# Create a sample Series
s = pd.Series([10, 25, 3, 15, 8, 30])

# Apply case_when to categorize values
result = s.case_when(
    caselist=[
        (s < 10, "Small"),
        (s.between(10, 20), "Medium"),
        (s > 20, "Large")
    ]
)

print(result)
0    Medium
1     Large
2     Small
3    Medium
4     Small
5     Large
dtype: object