{ "cells": [ { "metadata": { "ExecuteTime": { "end_time": "2025-06-02T05:55:44.467787Z", "start_time": "2025-06-02T05:55:42.040134Z" } }, "cell_type": "code", "source": [
 "# Imports, plot configuration, dataset loading (remote first, local fallback)\n",
 "# and a first look at the raw data and the target distribution.\n",
 "import matplotlib.pyplot as plt\n",
 "import seaborn as sns\n",
 "import pandas as pd\n",
 "import numpy as np\n",
 "from tqdm import tqdm  # progress bar\n",
 "import time  # timing\n",
 "from collections import Counter\n",
 "from sklearn.metrics import accuracy_score, roc_curve, auc, precision_score, recall_score, f1_score, confusion_matrix, \\\n",
 "    classification_report\n",
 "from sklearn.model_selection import train_test_split  # train / test split\n",
 "from sklearn.preprocessing import StandardScaler, OneHotEncoder  # standardization and one-hot encoding\n",
 "from sklearn.compose import ColumnTransformer  # column transformer used for preprocessing\n",
 "from sklearn.datasets import load_breast_cancer  # breast cancer dataset (used for comparison in the final task)\n",
 "\n",
 "# The plot labels below are Chinese; presumably MiSans is chosen because it\n",
 "# provides those glyphs (confirm the font is installed on the target machine).\n",
 "plt.rcParams['font.sans-serif'] = ['MiSans']\n",
 "plt.rcParams['axes.unicode_minus'] = False\n",
 "\n",
 "# --- Load the raw dataset ---\n",
 "try:\n",
 "    df = pd.read_csv(\"hf://datasets/schooly/online-shoppers-intention/online_shoppers_intention.csv\")\n",
 "    print(\"成功从 Hugging Face🤗 加载 'online_shoppers_intention' 数据集\")\n",
 "except Exception as e:\n",
 "    print(f\"错误: 数据集联网加载失败: {e}\")\n",
 "    # Remote loading failed: fall back to a local copy of the CSV\n",
 "    try:\n",
 "        df = pd.read_csv(\"online_shoppers_intention.csv\")\n",
 "        print(\"成功加载本地数据集 'online_shoppers_intention.csv'\")\n",
 "    except FileNotFoundError:\n",
 "        print(\"错误: 本地也未找到 'online_shoppers_intention.csv'。请确保文件存在或网络连接正常。程序将退出。\")\n",
 "        # Re-raise so the cell stops here with a clear traceback. exit() inside a\n",
 "        # Jupyter kernel asks the kernel to shut down instead of just aborting\n",
 "        # the cell, and the code below would run without `df` being defined.\n",
 "        raise\n",
 "\n",
 "# --- Initial data exploration (look at the raw dataset first) ---\n",
 "print(\"\\n--- 数据集概览 ---\")\n",
 "display(df.head())\n",
 "print(\"\\n--- 数据信息 ---\")\n",
 "df.info()  # info() prints its own report and returns None, so it is not wrapped in print()\n",
 "print(\"\\n--- 缺失值检查 ---\")\n",
 "print(df.isnull().sum())\n",
 "\n",
 "# Target variable distribution\n",
 "print(\"\\n--- 目标变量 'Revenue' 分布 ---\")\n",
 "print(df['Revenue'].value_counts(normalize=True))  # class proportions (normalized counts)\n",
 "plt.figure(figsize=(6, 4))\n",
 "sns.countplot(x='Revenue', data=df)\n",
 "plt.title('目标变量 Revenue 分布')\n",
 "plt.xlabel('是否产生收益 (Revenue)')\n",
 "plt.ylabel('访客数量')\n",
 "plt.xticks([0, 1], ['未产生 (False)', '产生 (True)'])\n",
 "plt.show()"
 ], "id": "aa755d8a43b0e60b", "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/grtsinry43/.conda/envs/ml/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "成功从 Hugging Face🤗 加载 'online_shoppers_intention' 数据集\n", "\n", "--- 数据集概览 ---\n" ] }, { "data": { "text/plain": [ " Administrative Administrative_Duration Informational \\\n", "0 0 0.0 0 \n", "1 0 0.0 0 \n", "2 0 0.0 0 \n", "3 0 0.0 0 \n", "4 0 0.0 0 \n", "\n", " Informational_Duration ProductRelated ProductRelated_Duration \\\n", "0 0.0 1 0.000000 \n", "1 0.0 2 64.000000 \n", "2 0.0 1 0.000000 \n", "3 0.0 2 2.666667 \n", "4 0.0 10 627.500000 \n", "\n", " BounceRates ExitRates PageValues SpecialDay Month OperatingSystems \\\n", "0 0.20 0.20 0.0 0.0 Feb 1 \n", "1 0.00 0.10 0.0 0.0 Feb 2 \n", "2 0.20 0.20 0.0 0.0 Feb 4 \n", "3 0.05 0.14 0.0 0.0 Feb 3 \n", "4 0.02 0.05 0.0 0.0 Feb 3 \n", "\n", " Browser Region TrafficType VisitorType Weekend Revenue \n", "0 1 1 1 Returning_Visitor False False \n", "1 2 1 2 Returning_Visitor False False \n", "2 1 9 3 Returning_Visitor False False \n", "3 2 2 4 Returning_Visitor False False \n", "4 3 1 4 Returning_Visitor True False " ], "text/html": [ "
| \n", " | Administrative | \n", "Administrative_Duration | \n", "Informational | \n", "Informational_Duration | \n", "ProductRelated | \n", "ProductRelated_Duration | \n", "BounceRates | \n", "ExitRates | \n", "PageValues | \n", "SpecialDay | \n", "Month | \n", "OperatingSystems | \n", "Browser | \n", "Region | \n", "TrafficType | \n", "VisitorType | \n", "Weekend | \n", "Revenue | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "0 | \n", "0.0 | \n", "0 | \n", "0.0 | \n", "1 | \n", "0.000000 | \n", "0.20 | \n", "0.20 | \n", "0.0 | \n", "0.0 | \n", "Feb | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "Returning_Visitor | \n", "False | \n", "False | \n", "
| 1 | \n", "0 | \n", "0.0 | \n", "0 | \n", "0.0 | \n", "2 | \n", "64.000000 | \n", "0.00 | \n", "0.10 | \n", "0.0 | \n", "0.0 | \n", "Feb | \n", "2 | \n", "2 | \n", "1 | \n", "2 | \n", "Returning_Visitor | \n", "False | \n", "False | \n", "
| 2 | \n", "0 | \n", "0.0 | \n", "0 | \n", "0.0 | \n", "1 | \n", "0.000000 | \n", "0.20 | \n", "0.20 | \n", "0.0 | \n", "0.0 | \n", "Feb | \n", "4 | \n", "1 | \n", "9 | \n", "3 | \n", "Returning_Visitor | \n", "False | \n", "False | \n", "
| 3 | \n", "0 | \n", "0.0 | \n", "0 | \n", "0.0 | \n", "2 | \n", "2.666667 | \n", "0.05 | \n", "0.14 | \n", "0.0 | \n", "0.0 | \n", "Feb | \n", "3 | \n", "2 | \n", "2 | \n", "4 | \n", "Returning_Visitor | \n", "False | \n", "False | \n", "
| 4 | \n", "0 | \n", "0.0 | \n", "0 | \n", "0.0 | \n", "10 | \n", "627.500000 | \n", "0.02 | \n", "0.05 | \n", "0.0 | \n", "0.0 | \n", "Feb | \n", "3 | \n", "3 | \n", "1 | \n", "4 | \n", "Returning_Visitor | \n", "True | \n", "False | \n", "