scikit-learn Pipeline Integration¶
This notebook demonstrates using SpectraKit preprocessing steps inside a scikit-learn pipeline for a classification workflow.
In [ ]:
Copied!
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.svm import SVC
from spectrakit import baseline_als, normalize_snv, smooth_savgol
from spectrakit.sklearn import SpectralTransformer
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.svm import SVC
from spectrakit import baseline_als, normalize_snv, smooth_savgol
from spectrakit.sklearn import SpectralTransformer
Generate Synthetic Classification Data¶
Create two classes of spectra with different peak patterns.
In [ ]:
Copied!
# Reproducible RNG and synthetic-dataset dimensions.
rng = np.random.default_rng(42)  # fixed seed so every run produces the same spectra
n_samples = 100   # total spectra; first half is class 0, second half class 1 (see loop below)
n_features = 500  # intensity points per spectrum
# Simulated wavenumber axis spanning 400-4000 (cm^-1-style range).
wavenumbers = np.linspace(400, 4000, n_features)
def gaussian(x, c, a, s):
    """Gaussian peak with amplitude ``a``, center ``c`` and width ``s``, evaluated at ``x``."""
    z = (x - c) / s
    return a * np.exp(-0.5 * z * z)
# Both classes share a peak at 1000; the discriminating peak sits at
# 2500 for class 0 and 3000 for class 1.
X = np.zeros((n_samples, n_features))
y = np.zeros(n_samples, dtype=int)

for idx in range(n_samples):
    in_class_0 = idx < n_samples // 2
    # Peak common to both classes, with slightly randomized amplitude.
    common = gaussian(wavenumbers, 1000, 2.0 + rng.normal(0, 0.2), 30)
    # Additive baseline: random offset plus a slow sinusoidal drift.
    drift = 0.3 * rng.random() + 0.2 * np.sin(wavenumbers / 800)
    # Per-point measurement noise.
    eps = rng.normal(0, 0.05, n_features)
    # Class-specific peak position and label.
    center = 2500 if in_class_0 else 3000
    peak = gaussian(wavenumbers, center, 1.5 + rng.normal(0, 0.2), 40)
    y[idx] = 0 if in_class_0 else 1
    X[idx] = common + peak + drift + eps
# Visualize five example spectra from each class, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
panels = [
    (axes[0], range(5), "Class 0 (5 samples)"),
    (axes[1], range(50, 55), "Class 1 (5 samples)"),
]
for ax, rows, title in panels:
    for row in rows:
        ax.plot(wavenumbers, X[row], alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel("Wavenumber")
    ax.set_ylabel("Intensity")
    # Spectroscopy convention: wavenumber axis decreasing left to right.
    ax.invert_xaxis()
plt.tight_layout()
plt.show()
# NOTE: this is the duplicated rendering of the data-generation cell above,
# produced by the docs exporter's "Copied!" widget.
rng = np.random.default_rng(42)  # fixed seed for reproducibility
n_samples = 100
n_features = 500
wavenumbers = np.linspace(400, 4000, n_features)


def gaussian(x, c, a, s):
    # Gaussian peak: amplitude a, center c, width (std dev) s.
    return a * np.exp(-((x - c) ** 2) / (2 * s**2))


# Class 0: peaks at 1000 and 2500
# Class 1: peaks at 1000 and 3000
X = np.zeros((n_samples, n_features))
y = np.zeros(n_samples, dtype=int)
for i in range(n_samples):
    # Common peak
    base = gaussian(wavenumbers, 1000, 2.0 + rng.normal(0, 0.2), 30)
    # Baseline drift
    baseline = 0.3 * rng.random() + 0.2 * np.sin(wavenumbers / 800)
    noise = rng.normal(0, 0.05, n_features)
    if i < n_samples // 2:
        # Class 0
        discriminant = gaussian(wavenumbers, 2500, 1.5 + rng.normal(0, 0.2), 40)
        y[i] = 0
    else:
        # Class 1
        discriminant = gaussian(wavenumbers, 3000, 1.5 + rng.normal(0, 0.2), 40)
        y[i] = 1
    X[i] = base + discriminant + baseline + noise

# Visualize: five spectra per class, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
for i in range(5):
    axes[0].plot(wavenumbers, X[i], alpha=0.7)
axes[0].set_title("Class 0 (5 samples)")
for i in range(50, 55):
    axes[1].plot(wavenumbers, X[i], alpha=0.7)
axes[1].set_title("Class 1 (5 samples)")
for ax in axes:
    ax.set_xlabel("Wavenumber")
    ax.set_ylabel("Intensity")
    ax.invert_xaxis()
plt.tight_layout()
plt.show()
Build sklearn Pipeline¶
Combine SpectraKit preprocessing with PCA + SVM.
In [ ]:
Copied!
# Full model: SpectraKit preprocessing wrapped as sklearn transformers,
# followed by PCA dimensionality reduction and an RBF-kernel SVM.
steps = [
    ("smooth", SpectralTransformer(smooth_savgol, window_length=11)),
    ("baseline", SpectralTransformer(baseline_als, lam=1e6, p=0.01)),
    ("normalize", SpectralTransformer(normalize_snv)),
    ("pca", PCA(n_components=10)),
    ("svm", SVC(kernel="rbf", C=1.0)),
]
pipe = SkPipeline(steps)
print(pipe)
# Duplicate of the pipeline-construction cell above ("Copied!" export artifact).
# Smoothing -> baseline correction -> SNV normalization -> PCA -> SVM.
pipe = SkPipeline(
    [
        ("smooth", SpectralTransformer(smooth_savgol, window_length=11)),
        ("baseline", SpectralTransformer(baseline_als, lam=1e6, p=0.01)),
        ("normalize", SpectralTransformer(normalize_snv)),
        ("pca", PCA(n_components=10)),
        ("svm", SVC(kernel="rbf", C=1.0)),
    ]
)
print(pipe)
Cross-Validation¶
In [ ]:
Copied!
# 5-fold cross-validated accuracy of the full pipeline. Because the
# preprocessing lives inside the Pipeline, it is re-fit on each training
# fold, avoiding train/test leakage.
scores = cross_val_score(pipe, X, y, cv=5, scoring="accuracy")
print(f"Accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")
print(f"Per-fold scores: {scores}")
# Duplicate of the cross-validation cell above ("Copied!" export artifact).
scores = cross_val_score(pipe, X, y, cv=5, scoring="accuracy")
print(f"Accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")
print(f"Per-fold scores: {scores}")
Visualize Preprocessing Effect¶
In [ ]:
Copied!
# Run only the preprocessing stages so raw and cleaned spectra can be
# compared directly.
preprocess = SkPipeline(
    [
        ("smooth", SpectralTransformer(smooth_savgol, window_length=11)),
        ("baseline", SpectralTransformer(baseline_als, lam=1e6, p=0.01)),
        ("normalize", SpectralTransformer(normalize_snv)),
    ]
)
X_processed = preprocess.fit_transform(X)

fig, axes = plt.subplots(1, 2, figsize=(14, 4))
for row in range(5):
    axes[0].plot(wavenumbers, X[row], alpha=0.5)
    axes[1].plot(wavenumbers, X_processed[row], alpha=0.5)
for ax, title in zip(axes, ("Raw", "Preprocessed")):
    ax.set_title(title)
    ax.set_xlabel("Wavenumber")
    ax.invert_xaxis()
plt.tight_layout()
plt.show()
# Duplicate of the preprocessing-visualization cell above ("Copied!" export artifact).
# Apply just the preprocessing steps
preprocess = SkPipeline(
    [
        ("smooth", SpectralTransformer(smooth_savgol, window_length=11)),
        ("baseline", SpectralTransformer(baseline_als, lam=1e6, p=0.01)),
        ("normalize", SpectralTransformer(normalize_snv)),
    ]
)
X_processed = preprocess.fit_transform(X)
# Plot the first five spectra before and after preprocessing.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
for i in range(5):
    axes[0].plot(wavenumbers, X[i], alpha=0.5)
    axes[1].plot(wavenumbers, X_processed[i], alpha=0.5)
axes[0].set_title("Raw")
axes[1].set_title("Preprocessed")
for ax in axes:
    ax.set_xlabel("Wavenumber")
    ax.invert_xaxis()
plt.tight_layout()
plt.show()
PCA Visualization¶
In [ ]:
Copied!
# Project the preprocessed spectra onto the first two principal components
# to see how well the two classes separate.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_processed)

# Percentage of variance explained by each component, for the axis labels.
var_pct = pca.explained_variance_ratio_ * 100

plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    X_pca[:, 0], X_pca[:, 1], c=y, cmap="RdBu", alpha=0.7, edgecolors="k", linewidth=0.5
)
plt.xlabel(f"PC1 ({var_pct[0]:.1f}% var)")
plt.ylabel(f"PC2 ({var_pct[1]:.1f}% var)")
plt.title("PCA of Preprocessed Spectra")
plt.colorbar(scatter, label="Class")
plt.show()
# Duplicate of the PCA-visualization cell above ("Copied!" export artifact).
# Fit PCA on preprocessed data
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_processed)
plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    X_pca[:, 0], X_pca[:, 1], c=y, cmap="RdBu", alpha=0.7, edgecolors="k", linewidth=0.5
)
# Axis labels report the variance captured by each component.
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0] * 100:.1f}% var)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1] * 100:.1f}% var)")
plt.title("PCA of Preprocessed Spectra")
plt.colorbar(scatter, label="Class")
plt.show()