This tutorial shows a practical, repeatable workflow for preparing data for machine learning using pandas, sklearn.preprocessing, and sklearn.pipeline.
import pandas as pd
df = pd.read_csv("data.csv") # path to your dataset
print(df.shape)
print(df.head())
print(df.dtypes)
Or, to follow along without a local file, you can load a built-in dataset instead:
from sklearn.datasets import fetch_california_housing
import pandas as pd
data = fetch_california_housing(as_frame=True)
df = data.frame # includes features + target column
print(df.head())
# 1) duplicates
dup_count = df.duplicated().sum()
print("duplicates:", dup_count)
# 2) missing values
print(df.isna().sum().sort_values(ascending=False).head(10))
# 3) basic stats
print(df.describe(include="all").T.head(15))
df = df.drop_duplicates()
# example: cleaning a "city" column
if "city" in df.columns:
df["city"] = df["city"].astype(str).str.strip().str.lower()
# drop rows with any missing values
df_dropped = df.dropna()
# drop columns with too many missing values (example threshold: 40%)
threshold = 0.4
to_drop = [c for c in df.columns if df[c].isna().mean() > threshold]
df = df.drop(columns=to_drop)
print("dropped columns:", to_drop)
You’ll usually impute instead: fill numeric columns with the mean or median, and categorical columns with the most frequent value (the mode).
In scikit-learn, the cleanest place to do this is inside a pipeline (best practice), so the fill values are learned from the training data only and reused at prediction time. You’ll see it below.
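For a quick standalone look outside a pipeline, SimpleImputer works on a DataFrame directly. A minimal sketch; the "income" and "city" column names are hypothetical placeholders:
from sklearn.impute import SimpleImputer
# numeric: replace missing values with the column median ("income" is a placeholder name)
num_imputer = SimpleImputer(strategy="median")
df[["income"]] = num_imputer.fit_transform(df[["income"]])
# categorical: replace missing values with the most frequent category
cat_imputer = SimpleImputer(strategy="most_frequent")
df[["city"]] = cat_imputer.fit_transform(df[["city"]])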
Common strategies: one-hot encoding for nominal categories (no inherent order) and ordinal encoding for ordered categories (a quick ordinal sketch follows the one-hot example below).
Example (pandas one-hot for quick exploration):
if "city" in df.columns:
    df_encoded = pd.get_dummies(df, columns=["city"], drop_first=True)
Best practice for ML: use OneHotEncoder inside a pipeline (shown later).
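For ordered categories, here is a minimal OrdinalEncoder sketch; the "size" column and its levels are hypothetical:
from sklearn.preprocessing import OrdinalEncoder
# explicit category order => low=0, medium=1, high=2
encoder = OrdinalEncoder(categories=[["low", "medium", "high"]])
df[["size"]] = encoder.fit_transform(df[["size"]])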
Why scale? Many models (linear and logistic regression, SVMs, k-nearest neighbors, and anything optimized by gradient descent) are sensitive to feature magnitudes, so a feature measured in thousands can swamp one measured in fractions.
Common scalers: StandardScaler (zero mean, unit variance), MinMaxScaler (rescales to a fixed range, typically [0, 1]), and RobustScaler (centers with the median and scales by the IQR, so it tolerates outliers).
We’ll use StandardScaler in the pipeline.
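To see what StandardScaler does on its own, here is a minimal sketch over the numeric columns; it assumes missing values were already imputed, and in practice you should fit scalers on training data only, as the pipeline below does:
from sklearn.preprocessing import StandardScaler
num_cols = df.select_dtypes(include=["number"]).columns
scaled = StandardScaler().fit_transform(df[num_cols])  # assumes no NaNs remain
print(scaled.mean(axis=0).round(2))  # ~0 for every column
print(scaled.std(axis=0).round(2))   # ~1 for every column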
Typical split: train / validation / test, for example 60/20/20, done in two stages with train_test_split:
from sklearn.model_selection import train_test_split
target_col = "target" # change this
X = df.drop(columns=[target_col])
y = df[target_col]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)
# 0.25 of 0.8 = 0.2 => train 60%, val 20%, test 20%
print(X_train.shape, X_val.shape, X_test.shape)
Tip: For classification, pass stratify=y to train_test_split to keep class proportions consistent across the splits.
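For example, assuming y holds class labels:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)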
A Pipeline chains preprocessing + model steps so you fit every transform on training data only (no leakage), apply identical transforms at prediction time, and can tune or cross-validate the whole workflow as a single estimator.
We’ll build a ColumnTransformer that routes numeric and categorical columns to their own preprocessing, wrapped in a Pipeline together with the model.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
# For regression, swap with: from sklearn.linear_model import LinearRegression
# ---- 1) Load data ----
df = pd.read_csv("data.csv") # replace with your path
target_col = "target" # replace with your target column name
X = df.drop(columns=[target_col])
y = df[target_col]
# ---- 2) Split data ----
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# ---- 3) Detect column types ----
numeric_features = X_train.select_dtypes(include=["number"]).columns
categorical_features = X_train.select_dtypes(exclude=["number"]).columns
# ---- 4) Build preprocessors ----
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)
# ---- 5) Build full pipeline (preprocess + model) ----
model = LogisticRegression(max_iter=1000)
clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", model),
])
# ---- 6) Fit ----
clf.fit(X_train, y_train)
# ---- 7) Evaluate ----
print("Train score:", clf.score(X_train, y_train))
print("Test score:", clf.score(X_test, y_test))
Sometimes you just want the transformed feature matrices, without attaching a model:
X_train_ready = preprocessor.fit_transform(X_train)
X_test_ready = preprocessor.transform(X_test)
print(type(X_train_ready), X_train_ready.shape)
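To recover column names for the transformed matrix, a fitted ColumnTransformer exposes get_feature_names_out (scikit-learn 1.0+); a minimal sketch:
feature_names = preprocessor.get_feature_names_out()
# with many one-hot columns the output can be a scipy sparse matrix, so densify if needed
dense = X_train_ready.toarray() if hasattr(X_train_ready, "toarray") else X_train_ready
X_train_df = pd.DataFrame(dense, columns=feature_names)
print(X_train_df.head())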