import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from pyhere import here
from palmerpenguins import load_penguins7 Pandas Data Frame (filter)
titanic = pd.read_csv(here("data/titanic/titanic.csv"))
penguins = load_penguins()7.1 Filter rows with Regex
df = pd.DataFrame({"First": ["Mary", "James"], "Sur": ["Pop", "Bond"]})df[df['First'].str.contains('Ma', regex=True)]| First | Sur | |
|---|---|---|
| 0 | Mary | Pop |
df[df['First'].str.contains('ma', case=False)]| First | Sur | |
|---|---|---|
| 0 | Mary | Pop |
7.2 Filter Multiple Columes
7.2.1 Manual Method
df = pd.DataFrame({"First": ["Mary", "James"], "Sur": ["Pop", "Bond"]})
query_text = "Po"
df[df['First'].str.contains(query_text, case=False) | df['Sur'].str.contains(query_text, case=False)]| First | Sur | |
|---|---|---|
| 0 | Mary | Pop |
7.2.2 Automatic method
# Create the DataFrame
df = pd.DataFrame({"First": ["Mary", "James"], "Sur": ["Pop", "Bond"]})
# Define the query text
query_text = "Po"
# Create a mask for each column and combine them using 'any' along the columns
mask = df.apply(lambda column: column.str.contains(query_text, case=False)).any(axis=1)
# Filter the DataFrame using the combined mask
filtered_df = df[mask]
filtered_df| First | Sur | |
|---|---|---|
| 0 | Mary | Pop |
def query_any_column_df(df, query_text: str, case=False):
import pandas as pd
# Create a mask for each column and combine them using 'any' along the columns
mask = df.apply(lambda column: column.str.contains(query_text, case=case)).any(axis=1)
# Filter the DataFrame using the combined mask
filtered_df = df[mask]
return filtered_dfquery_any_column_df(df, "Ma")| First | Sur | |
|---|---|---|
| 0 | Mary | Pop |
df_zero = query_any_column_df(df, "0")df_zero.shape(0, 2)
query_any_column_df(penguins[["species", "island", "sex"]], "Chin")| species | island | sex | |
|---|---|---|---|
| 276 | Chinstrap | Dream | female |
| 277 | Chinstrap | Dream | male |
| 278 | Chinstrap | Dream | male |
| 279 | Chinstrap | Dream | female |
| 280 | Chinstrap | Dream | male |
| ... | ... | ... | ... |
| 339 | Chinstrap | Dream | male |
| 340 | Chinstrap | Dream | female |
| 341 | Chinstrap | Dream | male |
| 342 | Chinstrap | Dream | male |
| 343 | Chinstrap | Dream | female |
68 rows × 3 columns
df.apply(lambda column: column.str.contains(query_text, case=False))| First | Sur | |
|---|---|---|
| 0 | False | True |
| 1 | False | False |
7.3 Case When: .case_when()
import pandas as pd
# Create a sample Series
s = pd.Series([10, 25, 3, 15, 8, 30])
# Apply case_when to categorize values
result = s.case_when(
caselist=[
(s < 10, "Small"),
(s.between(10, 20), "Medium"),
(s > 20, "Large")
]
)
print(result)0 Medium
1 Large
2 Small
3 Medium
4 Small
5 Large
dtype: object