蒙特卡洛方法来源于概率研究,广泛应用于机器学习模型优化。文章分析其基本原理及实践应用。
原文标题:一文读懂蒙特卡洛算法:从概率模拟到机器学习模型优化的全方位解析
原文作者:数据派THU
冷月清谈:
怜星夜思:
2、为什么蒙特卡洛方法在机器学习中适用?
3、蒙特卡洛模拟与贝叶斯优化的区别有什么?
原文内容

来源:Deephub Imba本文约5000字,建议阅读5分钟本文将深入探讨蒙特卡洛模拟的原理,并展示其在统计和机器学习中的实际应用。
】
蒙特卡洛方法的起源与发展
蒙特卡洛模拟的基本原理
理解蒙特卡洛模拟
实例:估算圆周率π
随机猜测 vs. 蒙特卡洛方法
#Random Guessing of Pi
导入必要的库
import random
import plotly.graph_objects as go
import numpy as np设置猜测次数
num_guesses = 6
生成单位圆的坐标
theta = np.linspace(0, 2*np.pi, 100)
unit_circle_x = np.cos(theta)
unit_circle_y = np.sin(theta)进行多次猜测
for i in range(num_guesses):
在2到4之间随机猜测pi的值
pi_guess = random.uniform(2, 4)
根据猜测生成圆的坐标
radius = pi_guess / 4
circle_x = radius * np.cos(theta)
circle_y = radius * np.sin(theta)创建散点图
fig = go.Figure()
添加猜测的圆
fig.add_trace(go.Scatter(
x = circle_x,
y = circle_y,
mode=‘lines’,
line=dict(
color=‘blue’,
width=3
),
name=‘Estimated Circle’
))添加单位圆
fig.add_trace(go.Scatter(
x = unit_circle_x,
y = unit_circle_y,
mode=‘lines’,
line=dict(
color=‘green’,
width=3
),
name=‘Unit Circle’
))更新图形布局
fig.update_layout(
title=f"Fig1{chr(97 + i)}: Randomly Guessing Pi: {pi_guess}",
width=600,
height=600,
xaxis=dict(
constrain=“domain”,
range=[-1, 1]
),
yaxis=dict(
scaleanchor=“x”,
scaleratio=1,
range=[-1, 1]
)
)显示图形
fig.show()
#Monte Carlo Estimation of Pi
导入必要的库
import random
import math
import plotly.graph_objects as go
import plotly.io as pio
import numpy as np设置飞镖数量
num_darts = 10000
darts_in_circle = 0存储飞镖坐标
x_coords_in, y_coords_in, x_coords_out, y_coords_out = , , ,
设置图形数量
num_figures = 6
darts_per_figure = num_darts // num_figures生成单位圆
theta = np.linspace(0, 2*np.pi, 100)
unit_circle_x = np.cos(theta)
unit_circle_y = np.sin(theta)模拟投掷飞镖
for i in range(num_darts):
x, y = random.uniform(-1, 1), random.uniform(-1, 1)if math.sqrt(x2 + y2) <= 1:
darts_in_circle += 1
x_coords_in.append(x)
y_coords_in.append(y)
else:
x_coords_out.append(x)
y_coords_out.append(y)if (i + 1) % darts_per_figure == 0:
pi_estimate = 4 * darts_in_circle / (i + 1)estimated_circle_radius = pi_estimate / 4
estimated_circle_x = estimated_circle_radius * np.cos(theta)
estimated_circle_y = estimated_circle_radius * np.sin(theta)fig = go.Figure()
fig.add_trace(go.Scattergl(x=x_coords_in, y=y_coords_in, mode=‘markers’, name=‘Darts Inside Circle’, marker=dict(color=‘green’, size=4, opacity=0.8)))
fig.add_trace(go.Scattergl(x=x_coords_out, y=y_coords_out, mode=‘markers’, name=‘Darts Outside Circle’, marker=dict(color=‘red’, size=4, opacity=0.8)))fig.add_trace(go.Scatter(x=unit_circle_x, y=unit_circle_y, mode=‘lines’, name=‘Unit Circle’, line=dict(color=‘green’, width=3)))
fig.add_trace(go.Scatter(x=estimated_circle_x, y=estimated_circle_y, mode=‘lines’, name=‘Estimated Circle’, line=dict(color=‘blue’, width=3)))fig.update_layout(title=f"Figure {chr(97 + (i + 1) // darts_per_figure - 1)}: Thrown Darts: {(i + 1)}, Estimated Pi: {pi_estimate}", width=600, height=600, xaxis=dict(constrain=“domain”, range=[-1, 1]), yaxis=dict(scaleanchor=“x”, scaleratio=1, range=[-1, 1]), legend=dict(yanchor=“top”, y=0.99, xanchor=“left”, x=0.01))
fig.show()
pio.write_image(fig, f"fig2{chr(97 + (i + 1) // darts_per_figure - 1)}.png")
# 计算估计值与真实π的差异 diff_pi = [abs(estimate - math.pi) for estimate in pi_estimates]
创建图表
fig2g = go.Figure(data=go.Scatter(x=num_darts_thrown, y=diff_pi, mode=‘lines’))
添加标题和标签
fig2g.update_layout(
title=“Fig2g: Darts Thrown vs Difference in Estimated Pi”,
xaxis_title=“Number of Darts Thrown”,
yaxis_title=“Difference in Pi”,
)显示图表
fig2g.show()
保存图表
pio.write_image(fig2g, “fig2g.png”)

# 500个蒙特卡洛场景,共投掷1,000,000个飞镖 import random import math import plotly.graph_objects as go import numpy as np
num_darts = 1000000
darts_in_circle = 0num_scenarios = 500
darts_per_scenario = num_darts // num_scenariosdarts_thrown_list =
pi_diff_list =for i in range(num_darts):
x, y = random.uniform(-1, 1), random.uniform(-1, 1)if math.sqrt(x2 + y2) <= 1:
darts_in_circle += 1if (i + 1) % darts_per_scenario == 0:
pi_estimate = 4 * darts_in_circle / (i + 1)darts_thrown_list.append((i + 1) / 1000)
pi_diff_list.append(abs(pi_estimate - math.pi))fig = go.Figure(data=go.Scattergl(x=darts_thrown_list, y=pi_diff_list, mode=‘markers’))
fig.update_layout(
title=“Fig2h: Difference between Estimated and Actual Pi vs. Number of Darts Thrown (in thousands)”,
xaxis_title=“Number of Darts Thrown (in thousands)”,
yaxis_title=“Difference between Estimated and Actual Pi”,
)
fig.show()
pio.write_image(fig, “fig2h.png”)

蒙特卡洛模拟在机器学习中的应用
心脏病数据集分析
import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.linear_model import LogisticRegression from sklearn.model_selection import GridSearchCV from sklearn.metrics import roc_auc_score import numpy as np import plotly.graph_objects as go
url = “https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data”
column_names = [“age”, “sex”, “cp”, “trestbps”, “chol”, “fbs”, “restecg”, “thalach”, “exang”, “oldpeak”, “slope”, “ca”, “thal”, “target”]df = pd.read_csv(url, names=column_names, na_values=“?”)
print(df.head())
数据预处理
from sklearn.impute import SimpleImputer from sklearn.preprocessing import LabelEncoder
print(df.isnull().sum())
imputer = SimpleImputer(strategy=‘median’)
df_filled = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)print(df_filled.head())
categorical_vars = df_filled.select_dtypes(include=‘object’).columns
encoder = LabelEncoder()
for var in categorical_vars:
df_filled[var] = encoder.fit_transform(df_filled[var])scaler = StandardScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df_filled), columns=df_filled.columns)
print(df_normalized.head())
基准模型
from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score, roc_auc_score
df_normalized[‘target’] = df[‘target’]
df_normalized[‘target’] = df_normalized[‘target’].apply(lambda x: 1 if x > 0 else 0)X = df_normalized.drop(‘target’, axis=1)
y = df_normalized[‘target’]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)model = LogisticRegression()
model.fit(X_train, y_train)y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print("Baseline Model " + f’Accuracy: {accuracy}‘)
print("Baseline Model " + f’ROC-AUC: {roc_auc}’)

使用网格搜索进行超参数调优
from sklearn.model_selection import GridSearchCV
hyperparameters = {‘C’: [0.001, 0.01, 0.1, 1, 10, 100, 1000],
‘penalty’: [‘l1’, ‘l2’]}grid_search = GridSearchCV(LogisticRegression(), hyperparameters, cv=5, scoring=‘roc_auc’)
grid_search.fit(X_train, y_train)best_params = grid_search.best_params_
print(f’Best hyperparameters: {best_params}')best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
roc_auc_best = roc_auc_score(y_test, y_pred_best)
print("Grid Search Method " + f’Accuracy of the best model: {accuracy_best}‘)
print("Grid Search Method " + f’ROC-AUC of the best model: {roc_auc_best}’)

使用蒙特卡洛方法进行超参数调优
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as npX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
C_range = np.logspace(-3, 3, 7)
penalty_options = [‘l1’, ‘l2’]best_score = 0
best_hyperparams = Nonefor _ in range(1000):
C = np.random.choice(C_range)
penalty = np.random.choice(penalty_options)model = LogisticRegression(C=C, penalty=penalty, solver=‘liblinear’)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)if roc_auc > best_score:
best_score = roc_auc
best_hyperparams = {‘C’: C, ‘penalty’: penalty}print("Monte Carlo Method " + f’Best ROC-AUC: {best_score}‘)
print("Monte Carlo Method " + f’Best hyperparameters: {best_hyperparams}’)best_model = LogisticRegression(**best_hyperparams, solver=‘liblinear’)
best_model.fit(X_train, y_train)y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Monte Carlo Method " + f’Accuracy of the best model: {accuracy}')
