14 Pandas: `.pipe`

The .pipe() method in pandas allows you to chain operations in a more functional and readable way. It allows for cleaner code, especially when you have multiple transformations that need to be applied to a DataFrame. You can use .pipe() together with other pandas methods like .assign() and .apply() to perform various transformations.

How .pipe() Works

.pipe() passes the current DataFrame (or Series) to a function as its first argument. You can chain multiple operations in a functional style, which is similar to the %>% pipe operator in R’s dplyr.

Let’s build a complete example where we:

Use .assign() to create new columns.
Use .apply() to apply a custom function to each row.
Chain these operations together with .pipe().

14.1 Basic Pipe

14.1.1 Separate Functions

import pandas as pd

# Sample DataFrame
df = pd.DataFrame({
    'col1': [1, 2, 3, 4],
    'col2': [5, 6, 7, 8],
    'col3': [9, 10, 11, 12]
})
df

	col1	col2	col3
0	1	5	9
1	2	6	10
2	3	7	11
3	4	8	12

# Define a function to sum three columns
def add_columns(df):
    return df.assign(col_sum=df['col1'] + df['col2'] + df['col3'])

# Define a function to apply across rows
def apply_product(df):
    df['col_product'] = df.apply(lambda row: row['col1'] * row['col2'] * row['col3'], axis=1)
    return df

# Chaining operations with pipe
df_transformed = (df
                  .pipe(add_columns)  # Add a column using assign
                  .pipe(apply_product)  # Apply a custom function using apply
                 )

df_transformed

	col1	col2	col3	col_sum	col_product
0	1	5	9	15	45
1	2	6	10	18	120
2	3	7	11	21	231
3	4	8	12	24	384

14.1.2 Pipe Inline

import pandas as pd

# Sample DataFrame
df = pd.DataFrame({
    'col1': [1, 2, 3, 4],
    'col2': [5, 6, 7, 8],
    'col3': [9, 10, 11, 12]
})

# Chaining operations using pipe with lambda functions
df_transformed = (df
                  # First pipe: Add a new column 'col_sum' using lambda in pipe
                  .pipe(lambda df: df.assign(col_sum=df['col1'] + df['col2'] + df['col3']))
                  # Second pipe: Apply a row-wise product calculation using apply
                  .pipe(lambda df: df.assign(col_product=df.apply(lambda row: row['col1'] * row['col2'] * row['col3'], axis=1)))
                 )

df_transformed

	col1	col2	col3	col_sum	col_product
0	1	5	9	15	45
1	2	6	10	18	120
2	3	7	11	21	231
3	4	8	12	24	384

14.2 Examples

14.2.1 Transformations with `.assign()` and Conditional Logic

import pandas as pd

# Sample DataFrame
df = pd.DataFrame({
    'col1': [1, 2, 3, 4],
    'col2': [5, 6, 7, 8],
    'col3': [9, 10, 11, 12]
})

# Chaining transformations using pipe and lambda
df_transformed = (
    df
    .pipe(lambda df: df.assign(col_sum = df['col1'] + df['col2'] + df['col3']))  # Sum of columns
    .pipe(lambda df: df.assign(col_flag = df['col_sum'].apply(lambda x: 'high' if x > 20 else 'low')))  # Conditional column
    .pipe(lambda df: df.assign(col_cumsum = df['col_sum'].cumsum()))  # Cumulative sum of col_sum
)

df_transformed

	col1	col2	col3	col_sum	col_flag	col_cumsum
0	1	5	9	15	low	15
1	2	6	10	18	low	33
2	3	7	11	21	high	54
3	4	8	12	24	high	78

14.2.2 Filtering Rows and Renaming Columns

# Chaining multiple transformations with filter and rename
df_filtered_renamed = (
    df
    .pipe(lambda df: df[df['col1'] > 2])  # Filter rows where col1 > 2
    .pipe(lambda df: df.rename(columns = {'col1': 'new_col1', 'col2': 'new_col2'}))  # Rename columns
)

df_filtered_renamed

	new_col1	new_col2	col3
2	3	7	11
3	4	8	12

14.2.3 Grouping, Aggregating, and Adding Calculations

# Sample DataFrame with groupable data
df_group = pd.DataFrame({
    'group': ['A', 'A', 'B', 'B', 'C'],
    'col1': [1, 2, 3, 4, 5],
    'col2': [10, 20, 30, 40, 50]
})

# Chaining groupby, aggregation, and adding new calculated columns
df_grouped = (
    df_group
    .pipe(lambda df: df.groupby('group').agg(col1_sum=('col1', 'sum'), col2_mean=('col2', 'mean')))  # Groupby and aggregate
    .pipe(lambda df: df.assign(ratio=df['col1_sum'] / df['col2_mean']))  # Add a new calculated column (ratio)
)

df_grouped

	col1_sum	col2_mean	ratio
group
A	3	15.0	0.2
B	7	35.0	0.2
C	5	50.0	0.1

14.2.4 Sorting and Selecting Specific Columns

# Chaining sort and column selection
df_sorted_selected = (
    df
    .pipe(lambda df: df.sort_values(by='col3', ascending=False))  # Sort by col3 in descending order
    .pipe(lambda df: df[['col1', 'col3']])  # Select only col1 and col3 columns
)

df_sorted_selected

	col1	col3
3	4	12
2	3	11
1	2	10
0	1	9