本文件夹代码集合

💡 快捷键提示：Ctrl+Shift+1/2/3/4 快速复制对应代码 | 双击文件名快速复制

折线图.py

# 1. 先导入库并加载鸢尾花数据集
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np

# 2. Load the iris dataset and collect the figures reported below.
iris = load_iris()
sample_count, feature_count = iris.data.shape  # rows = samples, columns = features
class_names, feature_names = iris.target_names, iris.feature_names

# 3. Hold out 20% of the samples as a test set; the fixed seed keeps the
#    split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=42, test_size=0.2
)
train_size, test_size = len(X_train), len(X_test)

# 4. Print the overview, one report line per print call.
for line in (
    "=== 鸢尾花数据集数据概览化 ===",
    "数据集信息：",
    f"- 样本数量：{sample_count}",
    f"- 特征数量：{feature_count}",
    f"- 类别名称：{class_names}",
    f"- 特征名称：{feature_names}",
    "\n数据集标准化：",
    f"训练集大小：{train_size} 个样本",
    f"测试集大小：{test_size} 个样本",
):
    print(line)



import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Every label in this chart is Chinese, so ask matplotlib for fonts that
# contain CJK glyphs (falling back to DejaVu Sans). Disabling
# 'axes.unicode_minus' makes minus signs use the ASCII hyphen, which those
# fonts are known to provide.
# NOTE(review): assumes SimHei or Microsoft YaHei is installed on the machine.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

# 1. Column names (subjects), row index (student names) and the score table.
a = np.array(["数学", "英语"])  # subject columns
a1 = np.array(["小米", "张三", "李四", "王五"])  # student index
a2 = np.array([[88, 95], [85, 75], [92, 88], [70, 82]])  # 4 rows x 2 columns: one row per student

# 2. Build the DataFrame: one row per student, one column per subject.
h = pd.DataFrame(data=a2, columns=a, index=a1)
print(h)

# 3. Draw one line per subject across the students.
plt.plot(h.index, h["数学"], label="数学")
plt.plot(h.index, h["英语"], label="英语")
plt.title('学生成绩折线图')
plt.xlabel("学生姓名")
plt.ylabel("成绩")
plt.legend()  # show which line belongs to which subject
plt.show()

柱状图.py

import pandas as pd



import numpy as np

import matplotlib.pyplot as plt

# Chinese font support: list CJK-capable fonts first and keep minus signs
# as the ASCII hyphen so they render with those fonts.
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'Microsoft YaHei', 'DejaVu Sans'],
    'axes.unicode_minus': False,
})

# 1. Raw data: subject names, student names, and one score row per student.
a = np.array(["数学", "英语"])
a1 = np.array(["小米", "张三", "李四", "王五"])
a2 = np.array([
    [88, 95],
    [85, 75],
    [92, 88],
    [70, 82],
])

# 2. Score table indexed by student with one column per subject; print it
#    followed by an empty line.
h = pd.DataFrame(a2, index=a1, columns=a)
print(h, end="\n\n")

# 3. Grouped bar chart: each student gets a pair of bars (one per subject).
plt.figure(figsize=(10, 6))

x = np.arange(len(a1))  # one slot per student
width = 0.35  # width of a single bar
half_width = width / 2

# Shift the two subjects half a bar width to either side of the slot centre
# so each student's bars sit next to each other instead of overlapping.
shared_style = {'width': width, 'alpha': 0.8}
bars1 = plt.bar(x - half_width, h["数学"], label='数学', color='skyblue', **shared_style)
bars2 = plt.bar(x + half_width, h["英语"], label='英语', color='lightcoral', **shared_style)

# 在柱子顶部显示成绩数值

def add_labels(bars):
    """Write each bar's height as a bold text label at the top of the bar.

    Args:
        bars: Iterable of bar objects (for example the container returned by
            ``plt.bar``). Each item must provide ``get_x()``, ``get_width()``
            and ``get_height()``.
    """
    for rect in bars:
        top = rect.get_height()
        center_x = rect.get_x() + rect.get_width() / 2.
        # va='bottom' anchors the lower edge of the text at the bar top, so
        # the number sits just above the bar rather than inside it.
        plt.text(center_x, top, f'{top}',
                 ha='center', va='bottom', fontsize=10, fontweight='bold')

# Put the score on top of every bar in both groups.
for bar_group in (bars1, bars2):
    add_labels(bar_group)

# Title and axis captions.
plt.title('学生成绩柱状图', fontsize=16, fontweight='bold')
plt.xlabel("学生姓名", fontsize=12)
plt.ylabel("成绩", fontsize=12)

# Replace the numeric slot positions on the x axis with the student names.
plt.xticks(x, a1)
plt.legend(fontsize=12)

# Dashed, faint grid lines along the y axis only.
plt.grid(True, linestyle='--', alpha=0.3, axis='y')

# Leave headroom above the tallest bar so the value labels fit.
plt.ylim(0, 105)

plt.tight_layout()
plt.show()

饼图.py

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt



# Chinese font support: list CJK-capable fonts first and keep minus signs
# as the ASCII hyphen so they render with those fonts.
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'Microsoft YaHei', 'DejaVu Sans'],
    'axes.unicode_minus': False,
})



# 1. Raw data: subject names, student names, and one score row per student.
a = np.array(["数学", "英语"])
a1 = np.array(["小米", "张三", "李四", "王五"])
a2 = np.array([
    [88, 95],
    [85, 75],
    [92, 88],
    [70, 82],
])

# 2. Score table indexed by student with one column per subject.
h = pd.DataFrame(a2, index=a1, columns=a)
print(h)

# 3. Pick the student whose scores the pie chart will show; change the name
#    to chart a different row of the table.
student_name = "小米"
student_scores = h.loc[student_name]



plt.figure(figsize=(10, 8))

# One wedge per subject, labelled with the subject name and its share of the
# student's total as a percentage with one decimal place.
colors = ['#ff9999', '#66b3ff']
pie_options = {
    'labels': student_scores.index,
    'autopct': '%1.1f%%',
    'colors': colors,
    'startangle': 90,
    'textprops': {'fontsize': 12},
}
plt.pie(student_scores, **pie_options)

plt.title(f'{student_name}的成绩分布饼图', fontsize=16)

# Equal axis scaling keeps the pie circular instead of elliptical.
plt.axis('equal')

plt.tight_layout()
plt.show()

鸢尾花.py

# Install the pinned dependencies first (Tsinghua PyPI mirror):
#
# 1. numpy 1.21.6
#    python -m pip install numpy==1.21.6 -i https://pypi.tuna.tsinghua.edu.cn/simple/
#
# 2. scipy 1.7.3 (prebuilt binary wheel only)
#    python -m pip install scipy==1.7.3 --only-binary=all -i https://pypi.tuna.tsinghua.edu.cn/simple/
#
# 3. scikit-learn 1.0.2
#    python -m pip install scikit-learn==1.0.2 -i https://pypi.tuna.tsinghua.edu.cn/simple/



from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split



# Load the iris dataset.
iris = load_iris()

# Split into training and test sets: 40% of the samples are held out for
# testing, and the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=42, test_size=0.4
)

# Report the dataset dimensions, the class and feature names, and the size
# of each split.
n_samples, n_features = iris.data.shape
print(f"样本数量: {n_samples}")
print(f"特征数量: {n_features}")
print(f"类别名称: {list(iris.target_names)}")
print(f"特征名称: {list(iris.feature_names)}")
print(f"训练集大小: {len(X_train)}")
print(f"测试集大小: {len(X_test)}")


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Simulated housing data: floor area, room count, floor number, construction
# year and district (a categorical column), plus the sale price, which is
# the prediction target.
data = {
    'area': [70, 85, 100, 120, 60, 150, 200, 80, 95, 110],
    'rooms': [2, 3, 3, 4, 2, 5, 6, 3, 3, 4],
    'floor': [5, 2, 8, 10, 3, 15, 18, 7, 9, 11],
    'year_built': [2005, 2010, 2012, 2015, 2000, 2018, 2020, 2008, 2011, 2016],
    'location': [
        'Chaoyang', 'Haidian', 'Chaoyang', 'Dongcheng', 'Fengtai',
        'Haidian', 'Chaoyang', 'Fengtai', 'Dongcheng', 'Haidian',
    ],
    'price': [
        5_000_000, 6_000_000, 6_500_000, 7_000_000, 4_500_000,
        10_000_000, 12_000_000, 5_500_000, 6_200_000, 7_500_000,
    ],
}

# Tabulate the records and show the first rows.
df = pd.DataFrame(data)
print("数据预览：", df.head(), sep="\n")

# Separate the model inputs (every column except the price) from the target.
feature_columns = ['area', 'rooms', 'floor', 'year_built', 'location']
X = df[feature_columns]
y = df['price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Preprocessing plan: standardise the numeric columns and one-hot encode the
# categorical column.
numeric_features = ['area', 'rooms', 'floor', 'year_built']
categorical_features = ['location']

# Sub-pipeline for numeric columns: standardisation only.
numeric_transformer = Pipeline([
    ('scaler', StandardScaler()),
])

# Sub-pipeline for categorical columns: one-hot encoding.
# handle_unknown='ignore' lets the encoder accept categories at prediction
# time that it never saw while fitting, instead of raising.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own sub-pipeline. Every entry is
# (name, transformer, columns the transformer is applied to).
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# 3. Full model pipeline: preprocessing followed by a linear regression.
#    The step names matter: 'regressor' is the prefix used to address the
#    regression step's parameters from outside the pipeline.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit on the training rows, then predict the held-out rows.
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

print("\n预测结果：")
print(y_pred)

# 4. Evaluation.
#    MSE: average squared gap between prediction and truth; lower is better.
#    R^2: share of the target variance explained relative to always
#    predicting the mean; closer to 1 is better, and a negative value means
#    the model does worse than that mean baseline.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("\n模型评估：")
print(f"均方误差 (MSE): {mse:.2f}")
print(f"决定系数 (R²): {r2:.2f}")

# 5. Tuning: grid-search the single hyper-parameter exposed here, namely
#    whether the linear regression fits an intercept.
param_grid = {'regressor__fit_intercept': [True, False]}

# 5-fold cross-validation on the training rows, scored by negated MSE.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("\n最佳参数：")
print(grid_search.best_params_)

# Predict the held-out rows with the best pipeline found by the search and
# score it with the same two metrics as before.
best_model = grid_search.best_estimator_
y_pred_optimized = best_model.predict(X_test)
mse_opt, r2_opt = (
    mean_squared_error(y_test, y_pred_optimized),
    r2_score(y_test, y_pred_optimized),
)

print("\n优化后的模型评估：")
print(f"均方误差 (MSE): {mse_opt:.2f}")
print(f"决定系数 (R²): {r2_opt:.2f}")

本文件夹代码集合

折线图.py 快速复制

柱状图.py 快速复制

饼图.py 快速复制

鸢尾花.py 快速复制

折线图.py

柱状图.py

饼图.py

鸢尾花.py