机器学习实践指南

机器学习实践详解

本文将深入介绍机器学习的核心概念和实践技巧，帮助你掌握机器学习模型的开发和优化。

数据预处理

数据清洗

下面的示例用 pandas 和 scikit-learn 完成缺失值填充、重复数据删除和数值特征标准化：


import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def preprocess_data(df):
    """Clean a DataFrame in place: impute, de-duplicate, then standardize.

    Steps, in order:
      1. Fill missing values in numeric columns with that column's mean.
      2. Drop fully duplicated rows.
      3. Standardize every numeric column with ``StandardScaler``.

    Args:
        df: pandas DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # numeric_only=True keeps text/categorical columns from breaking the
    # mean computation; missing values in such columns are left untouched.
    df.fillna(df.mean(numeric_only=True), inplace=True)

    # Remove rows that are exact duplicates of an earlier row.
    df.drop_duplicates(inplace=True)

    numeric_columns = df.select_dtypes(include=[np.number]).columns
    # The scaler cannot be fitted on an empty matrix, so scaling is skipped
    # when there are no numeric columns or no rows left.
    if len(numeric_columns) > 0 and len(df) > 0:
        scaler = StandardScaler()
        df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

    return df

# Usage example: load the raw CSV and run it through the cleaning pipeline.
# preprocess_data returns the object it was given, so `df` and `df_cleaned`
# refer to the same (already cleaned) DataFrame afterwards.
df = pd.read_csv('data.csv')
df_cleaned = preprocess_data(df)

特征工程

下面的示例演示特征选择、特征组合和类别特征编码：


from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

def feature_engineering(X, y, k=None):
    """Add an interaction feature, one-hot encode, and optionally select features.

    Args:
        X: pandas DataFrame of features. Must contain the columns
            ``'feature1'`` and ``'feature2'`` (a ``KeyError`` is raised
            otherwise). The caller's DataFrame is not modified.
        y: Target values. Only used when ``k`` is given.
        k: Optional number of features to keep (or ``'all'``). When ``None``
            (the default) no selection is performed and every engineered
            column is returned.

    Returns:
        A new DataFrame containing the original columns, the
        ``'feature_interaction'`` column, and dummy columns in place of every
        object-typed column. When ``k`` is given, only the columns kept by
        ``SelectKBest`` with the ``f_classif`` score are returned.
    """
    # assign() builds a new frame, so the caller's DataFrame does not gain a
    # stray column as a side effect.
    engineered = X.assign(
        feature_interaction=X['feature1'] * X['feature2']
    )

    # One-hot encode every object-typed column.
    categorical_columns = engineered.select_dtypes(include=['object']).columns
    engineered = pd.get_dummies(engineered, columns=categorical_columns)

    if k is None:
        return engineered

    # Selection runs last so that it scores the encoded columns and its
    # result is actually returned instead of being discarded.
    n_features = engineered.shape[1]
    effective_k = k if k == 'all' else min(k, n_features)
    selector = SelectKBest(score_func=f_classif, k=effective_k)
    selector.fit(engineered, y)
    return engineered.loc[:, selector.get_support()]

模型训练

线性模型

下面的示例划分训练集与测试集，并训练一个逻辑回归模型：


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Split the data: 20% of the samples are held out as a test set, with a fixed
# seed so the split is reproducible.
# NOTE(review): X and y are not defined in this snippet; presumably they are
# the feature matrix and labels prepared earlier. Confirm before running.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a logistic regression classifier on the training portion.
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out set: y_pred holds the predicted labels and
# y_prob the predicted class probabilities.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)

随机森林模型（基于决策树的集成模型）

下面的示例使用网格搜索为随机森林分类器选择超参数：


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 3 x 4 x 3 = 36 candidate combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10]
}

# Grid search over a random forest with 5-fold cross-validation, i.e.
# 36 x 5 = 180 cross-validation fits. n_jobs=-1 requests all available
# CPU cores and verbose=2 prints progress messages.
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2
)

# Run the search on the training split (X_train / y_train come from the
# earlier train/test split) and keep the best estimator found.
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

深度学习

神经网络

下面的示例用 Keras 构建并训练一个用于二分类的全连接神经网络：


import tensorflow as tf
from tensorflow.keras import layers, models

def build_neural_network(input_shape):
    """Build and compile a small fully connected network with a sigmoid output.

    Architecture: Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu)
    -> Dropout(0.2) -> Dense(1, sigmoid).

    Args:
        input_shape: Shape passed as ``input_shape`` to the first Dense layer.

    Returns:
        A Keras ``Sequential`` model compiled with the Adam optimizer,
        binary cross-entropy loss and the accuracy metric.
    """
    network = models.Sequential()

    # Hidden block 1
    network.add(layers.Dense(128, activation='relu', input_shape=input_shape))
    network.add(layers.Dropout(0.3))

    # Hidden block 2
    network.add(layers.Dense(64, activation='relu'))
    network.add(layers.Dropout(0.2))

    # Single sigmoid unit as the output layer
    network.add(layers.Dense(1, activation='sigmoid'))

    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

# Train the network for up to 100 epochs in batches of 32, holding out 20%
# of the training data for validation.
model = build_neural_network((X_train.shape[1],))
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[
        # Stop training once the monitored metric has not improved for
        # 10 consecutive epochs.
        tf.keras.callbacks.EarlyStopping(patience=10),
        # save_best_only=True keeps only the best-scoring epoch on disk.
        # Without it the file is overwritten after every epoch and ends up
        # holding the last model rather than the best one.
        tf.keras.callbacks.ModelCheckpoint(
            'best_model.h5', save_best_only=True
        )
    ]
)

CNN模型

下面的示例构建一个输出 10 个类别的卷积神经网络：


def build_cnn(input_shape, num_classes=10):
    """Build and compile a small convolutional classifier.

    Architecture: three 3x3 convolution layers (32, 64 and 64 filters, ReLU),
    with 2x2 max pooling after the first two, followed by Flatten,
    Dense(64, relu) and a softmax output layer.

    Args:
        input_shape: Shape passed as ``input_shape`` to the first Conv2D
            layer (presumably ``(height, width, channels)``; confirm
            against the data pipeline).
        num_classes: Number of units in the softmax output layer. Defaults
            to 10, the value that was previously hard-coded.

    Returns:
        A Keras ``Sequential`` model compiled with the Adam optimizer,
        sparse categorical cross-entropy loss and the accuracy metric.
    """
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        # One output unit per class.
        layers.Dense(num_classes, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

模型评估

性能指标

下面的示例输出分类报告并绘制混淆矩阵：


from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def evaluate_model(y_true, y_pred):
    """Print a classification report and display the confusion matrix.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        None. The report is printed and the heatmap is shown with
        ``plt.show()``.
    """
    report = classification_report(y_true, y_pred)
    print(report)

    # Draw the confusion matrix as an annotated heatmap on a fresh figure.
    matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    axes = plt.gca()
    sns.heatmap(matrix, annot=True, fmt='d', ax=axes)
    axes.set_title('Confusion Matrix')
    axes.set_ylabel('True Label')
    axes.set_xlabel('Predicted Label')
    plt.show()

交叉验证

下面的示例以加权 F1 分数为指标执行交叉验证：


from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score

def cross_validate_model(model, X, y, cv=5):
    """Cross-validate ``model`` with a weighted F1 scorer and print the results.

    Args:
        model: Estimator to evaluate.
        X: Feature data.
        y: Target values.
        cv: Cross-validation setting forwarded to ``cross_val_score``
            (default 5).

    Returns:
        None. The per-fold scores and their mean (with two standard
        deviations) are printed.
    """
    weighted_f1 = make_scorer(f1_score, average='weighted')
    cv_options = {'cv': cv, 'scoring': weighted_f1, 'n_jobs': -1}

    scores = cross_val_score(model, X, y, **cv_options)

    summary_lines = (
        f"Cross-validation scores: {scores}",
        f"Average score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})",
    )
    for line in summary_lines:
        print(line)

模型优化

特征重要性

下面的示例提取并绘制最重要的 20 个特征：


def analyze_feature_importance(model, feature_names):
    """Plot the 20 most important features of a fitted model.

    Importance is taken from ``model.feature_importances_`` when that
    attribute exists; otherwise it is the absolute value of ``model.coef_``,
    averaged over rows when ``coef_`` is two-dimensional.

    Args:
        model: Fitted model exposing ``feature_importances_`` or ``coef_``.
        feature_names: Feature names, one per importance value.

    Returns:
        None. A horizontal bar chart is shown with ``plt.show()``.

    Raises:
        AttributeError: If the model has neither attribute.
    """
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
    else:
        # coef_ may be 1-D (one weight per feature) or 2-D (one row per
        # class/target). Promote it to 2-D and average the magnitudes over
        # rows, so a 1-D coef_ is not collapsed to a single scalar and no
        # row is silently ignored. For a single-row coef_ this equals
        # abs(coef_[0]).
        coefficients = np.abs(np.atleast_2d(model.coef_))
        importances = coefficients.mean(axis=0)

    feature_imp = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    })

    # Most important features first; only the top 20 are plotted.
    feature_imp = feature_imp.sort_values('importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=feature_imp.head(20))
    plt.title('Feature Importance')
    plt.show()

学习曲线

下面的示例绘制训练得分和交叉验证得分随训练样本数变化的学习曲线：


from sklearn.model_selection import learning_curve

def plot_learning_curve(model, X, y, cv=5):
    """Plot mean training and cross-validation scores against training size.

    Args:
        model: Estimator to evaluate.
        X: Feature data.
        y: Target values.
        cv: Cross-validation setting forwarded to ``learning_curve``
            (default 5).

    Returns:
        None. The chart is shown with ``plt.show()``.
    """
    # Ten evenly spaced training-set fractions from 10% to 100%.
    fractions = np.linspace(0.1, 1.0, 10)
    sizes, fit_scores, held_out_scores = learning_curve(
        model, X, y, cv=cv, n_jobs=-1, train_sizes=fractions
    )

    # Average over the folds (axis 1) to get one value per training size.
    mean_fit = fit_scores.mean(axis=1)
    mean_held_out = held_out_scores.mean(axis=1)

    plt.figure(figsize=(10, 6))
    axes = plt.gca()
    axes.plot(sizes, mean_fit, label='Training score')
    axes.plot(sizes, mean_held_out, label='Cross-validation score')
    axes.set_xlabel('Training examples')
    axes.set_ylabel('Score')
    axes.set_title('Learning Curve')
    axes.legend(loc='best')
    axes.grid(True)
    plt.show()

最佳实践

模型开发流程
- 理解业务问题
- 数据收集和清洗
- 特征工程
- 模型选择和训练
- 性能评估和优化
开发建议
- 使用版本控制
- 做好实验记录
- 提前考虑模型部署
- 定期更新模型

掌握这些机器学习技巧，将帮助你构建高质量的机器学习应用。