{ "cells": [ { "cell_type": "code", "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { "end_time": "2025-05-17T12:44:09.056878Z", "start_time": "2025-05-17T12:44:06.294335Z" } }, "source": [ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import pandas as pd\n", "\n", "try:\n", " df = pd.read_csv(\"hf://datasets/schooly/online-shoppers-intention/online_shoppers_intention.csv\")\n", "except FileNotFoundError:\n", " print(f\"错误: 数据集联网加载失败\")\n", " exit()\n", "\n", "# --- 初步数据探索 ---\n", "print(\"--- 数据集概览 ---\")\n", "print(df.head())\n", "print(\"\\n--- 数据信息 ---\")\n", "df.info()\n", "print(\"\\n--- 描述性统计 ---\")\n", "print(df.describe())\n", "print(\"\\n--- 缺失值检查 ---\")\n", "print(df.isnull().sum())\n", "\n", "# 目标变量分布\n", "print(\"\\n--- 目标变量 'Revenue' 分布 ---\")\n", "print(df['Revenue'].value_counts(normalize=True))\n", "sns.countplot(x='Revenue', data=df)\n", "plt.title('Revenue Distribution')\n", "plt.show()" ], "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/grtsinry43/.conda/envs/ml/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "--- 数据集概览 ---\n", " Administrative Administrative_Duration Informational \\\n", "0 0 0.0 0 \n", "1 0 0.0 0 \n", "2 0 0.0 0 \n", "3 0 0.0 0 \n", "4 0 0.0 0 \n", "\n", " Informational_Duration ProductRelated ProductRelated_Duration \\\n", "0 0.0 1 0.000000 \n", "1 0.0 2 64.000000 \n", "2 0.0 1 0.000000 \n", "3 0.0 2 2.666667 \n", "4 0.0 10 627.500000 \n", "\n", " BounceRates ExitRates PageValues SpecialDay Month OperatingSystems \\\n", "0 0.20 0.20 0.0 0.0 Feb 1 \n", "1 0.00 0.10 0.0 0.0 Feb 2 \n", "2 0.20 0.20 0.0 0.0 Feb 4 \n", "3 0.05 0.14 0.0 0.0 Feb 3 \n", "4 0.02 0.05 0.0 0.0 Feb 3 \n", "\n", " Browser Region TrafficType VisitorType Weekend Revenue \n", "0 1 1 1 Returning_Visitor False False \n", "1 2 1 2 Returning_Visitor False False \n", "2 1 9 3 Returning_Visitor False False \n", "3 2 2 4 Returning_Visitor False False \n", "4 3 1 4 Returning_Visitor True False \n", "\n", "--- 数据信息 ---\n", "\n", "RangeIndex: 12330 entries, 0 to 12329\n", "Data columns (total 18 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Administrative 12330 non-null int64 \n", " 1 Administrative_Duration 12330 non-null float64\n", " 2 Informational 12330 non-null int64 \n", " 3 Informational_Duration 12330 non-null float64\n", " 4 ProductRelated 12330 non-null int64 \n", " 5 ProductRelated_Duration 12330 non-null float64\n", " 6 BounceRates 12330 non-null float64\n", " 7 ExitRates 12330 non-null float64\n", " 8 PageValues 12330 non-null float64\n", " 9 SpecialDay 12330 non-null float64\n", " 10 Month 12330 non-null object \n", " 11 OperatingSystems 12330 non-null int64 \n", " 12 Browser 12330 non-null int64 \n", " 13 Region 12330 non-null int64 \n", " 14 TrafficType 12330 non-null int64 \n", " 15 VisitorType 12330 non-null object \n", " 16 Weekend 12330 non-null bool \n", " 17 Revenue 12330 non-null bool \n", "dtypes: bool(2), float64(7), int64(7), object(2)\n", "memory usage: 1.5+ MB\n", "\n", "--- 描述性统计 ---\n", " Administrative Administrative_Duration Informational \\\n", "count 12330.000000 12330.000000 12330.000000 \n", "mean 2.315166 80.818611 0.503569 \n", "std 3.321784 176.779107 1.270156 \n", "min 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 \n", "50% 1.000000 7.500000 0.000000 \n", "75% 4.000000 93.256250 0.000000 \n", "max 27.000000 3398.750000 24.000000 \n", "\n", " Informational_Duration ProductRelated ProductRelated_Duration \\\n", "count 12330.000000 12330.000000 12330.000000 \n", "mean 34.472398 31.731468 1194.746220 \n", "std 140.749294 44.475503 1913.669288 \n", "min 0.000000 0.000000 0.000000 \n", "25% 0.000000 7.000000 184.137500 \n", "50% 0.000000 18.000000 598.936905 \n", "75% 0.000000 38.000000 1464.157214 \n", "max 2549.375000 705.000000 63973.522230 \n", "\n", " BounceRates ExitRates PageValues SpecialDay \\\n", "count 12330.000000 12330.000000 12330.000000 12330.000000 \n", "mean 0.022191 0.043073 5.889258 0.061427 \n", "std 0.048488 0.048597 18.568437 0.198917 \n", "min 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.014286 0.000000 0.000000 \n", "50% 0.003112 0.025156 0.000000 0.000000 \n", "75% 0.016813 0.050000 0.000000 0.000000 \n", "max 0.200000 0.200000 361.763742 1.000000 \n", "\n", " OperatingSystems Browser Region TrafficType \n", "count 12330.000000 12330.000000 12330.000000 12330.000000 \n", "mean 2.124006 2.357097 3.147364 4.069586 \n", "std 0.911325 1.717277 2.401591 4.025169 \n", "min 1.000000 1.000000 1.000000 1.000000 \n", "25% 2.000000 2.000000 1.000000 2.000000 \n", "50% 2.000000 2.000000 3.000000 2.000000 \n", "75% 3.000000 2.000000 4.000000 4.000000 \n", "max 8.000000 13.000000 9.000000 20.000000 \n", "\n", "--- 缺失值检查 ---\n", "Administrative 0\n", "Administrative_Duration 0\n", "Informational 0\n", "Informational_Duration 0\n", "ProductRelated 0\n", "ProductRelated_Duration 0\n", "BounceRates 0\n", "ExitRates 0\n", "PageValues 0\n", "SpecialDay 0\n", "Month 0\n", "OperatingSystems 0\n", "Browser 0\n", "Region 0\n", "TrafficType 0\n", "VisitorType 0\n", "Weekend 0\n", "Revenue 0\n", "dtype: int64\n", "\n", "--- 目标变量 'Revenue' 分布 ---\n", "Revenue\n", "False 0.845255\n", "True 0.154745\n", "Name: proportion, dtype: float64\n" ] }, { "data": { "text/plain": [ "
" ], "image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAHHCAYAAACiOWx7AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjEsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvc2/+5QAAAAlwSFlzAAAPYQAAD2EBqD+naQAAN7xJREFUeJzt3Xl8TXfi//H3vVlsIZPFUqJKVFBCVIs0mhmlRUqLobUrRVstVXtprcXodBqhQ5pO00q1aC21a1EtY6vUPqi1hJnKgggVWe7vDz/n69b2SZrIDa/n45HHI/eczz3nc+iVV88598bmcDgcAgAAwC3ZC3oCAAAAhQHRBAAAYIBoAgAAMEA0AQAAGCCaAAAADBBNAAAABogmAAAAA0QTAACAAaIJAADAANEEALnUtWtXde3a9Y7sKygoSNOmTbMeT5s2TUFBQUpJSbkj+2/SpImGDx9+R/YFuCr3gp4AgLy1YMECjRgxwnrs5uYmPz8/PfbYYxo4cKDKli1bgLNzXcOHD9fChQutx8WLF5evr68eeughRUREqFmzZrLb//j/Z/7000/697//re7du6tUqVJ/eHt5yZXnBrgCogm4S/Xv318BAQG6fPmyduzYoYULFyo+Pl5Lly5VkSJFCnp6LsnT01MTJkyQJKWnp+vkyZP67rvv1L9/fz366KOaMWOGvLy8rPH/+te/cryP7du3a/r06WrTpk2OwmTXrl1yc3PL8f5y4lZzW7lypWw2W77uH3B1RBNwl3r88cdVu3ZtSVL79u3l4+OjmJgYrVmzRi1btizg2bkmd3d3PfPMM07LBg4cqA8//FDvvfeeRo0apcjISGudp6dnvs4nOztbGRkZKlKkSIGHbn4fK1AYcE8TcI+oX7++JOnEiRNOyw8fPmydSaldu7batm2rNWvWWOt3796toKAgp0tXV61fv15BQUH67rvvrGW//vqrRowYodDQUNWqVUsRERH66quvnJ63ZcsWBQUFafny5ZoxY4YVeN27d9cvv/ziNPZm99Lc6H6iy5cvKyoqSs2aNVOtWrUUHh6uKVOm6PLly4Z/SjfWp08fhYWFaeXKlTp69Ogt5xAXF6eIiAjVqVNHjzzyiNq2baslS5ZIunIf0pQpUyRJTzzxhIKCghQUFKSEhARJV+5bGjdunBYvXqyIiAjVrl1b69evt9Zde0/TVWfOnNGAAQNUr149NWjQQBMmTFB6erq1PiEhQUFBQVqwYMF1z712m7eb243+Hk6cOGH9t1OnTh116NBB69atcxqTk79rwNVxpgm4R5w8eVKSnC67HDx4UB07dlTZsmXVu3dvFS9eXCtWrFC/fv00bdo0NWvWTLVr11bFihW1YsUKtWnTxmmby5cvl7e3t8LCwiRJSUlJ6tChg2w2mzp37ixfX1/98MMPGjlypNLS0tSjRw+n58fExMhms6lnz55KS0vTRx99pMGDB+vLL7/M8fFlZ2fr5ZdfVnx8vDp06KDAwED9/PPP+vTTT3Xs2DH985//zPE2r9W6dWtt2LBBGzduVOXKlW84Zt68eZowYYKeeuopdevWTenp6Tpw4IB27typVq1aqVmzZjp27JiWLl2qESNGyMfHR5Lk6+trbWPz5s1asWKFOnfuLB8fH1WoUOGW83r99ddVoUIFDRo0SDt27FBcXJxSU1OtADJlMrdrJSUl6fnnn9dvv/2mrl27ysfHRwsXLtTLL79sheu18vLvGigoRBNwl0pLS1NKSoouX76snTt3avr06fL09NRf/vIXa8w777yj++67T/Pnz7cuv3Tq1EkdO3bU3//+d+sHX8uWLfXxxx/r3Llz8vb2lnTlrM7q1avVrFkzeXh4SJLef/99ZWVlacmSJdYP3Y4dO+qNN97Q9OnT9fzzz6to0aLW/tPT07Vo0SJr36VKldI777yjn3/+WdWqVcvR8S5ZskQbN25UXFycdVZNkh588EGNHj1aP/30k+rVq5fTP0bL1fkcP378pmPWrVunBx98UFFRUTdcX716ddWsWVNLly5V06ZNFRAQcN2Yo0ePasmSJapatarRvAICAjRjxgxJUufOneXl5aXPP/9cPXv2VPXq1Y22YTq3a3344YdKSkrS7NmzrT/v9u3bq3Xr1po0aZKeeOIJpxvn8/LvGigoXJ4D7lI9evRQo0aNFB4erv79+6tYsWKaMWOGypUrJ0k6e/asNm/erBYtWliBlZKSojNnzigsLEzHjh3Tr7/+KulKNGVkZOibb76xtv/vf/9bqamp1v1RDodD33zzjZo0aSKHw2FtLyUlRWFhYTp//rz27t3rNMe2bds63Stzs0uIJlauXKnAwEBVqVLFad8NGzaUdOUy0R9RvHhxSdKFCxduOqZUqVL63//+p127duV6P4888ohxMElXQulaXbp0kST98MMPuZ6Die+//17BwcFOgVqiRAk999xzOnnypA4dOuQ0Pi//roGCwpkm4C719ttvq3Llyjp//rzmz5+vH3/80emH1vHjx+VwODR16lRNnTr1httITk5W2bJlVb16dVWpUkUrVqxQ+/btJV25NOfj42NFSUpKilJTUzV37lzNnTv3htv7/WcKlS9f3unx1UuHqampOT7eX375RYcPH1ajRo1ueix/xMWLFyVdCYOb6d27tzZu3Kj27durUqVKeuyxx/T000/r4YcfNt7P7c7w/F6lSpWcHt9///2y2+3WvUj55dSpU6pTp851y6tUqWKtv/YMUl7+XQMFhWgC7lLBwcHWu+eaNm2qTp06adCgQVq5cqVKlCih7OxsSVLPnj3VuHHjG27j/vvvt75v2bKlZs6cqZSUFHl5eWnt2rWKiIiQu/uVf0aubq9169bX3ft0VVBQkNPjm33ukcPhuO3xZWVlOb0FPzs7W9WqVXP6jKprXT3Dlls///yzJOc/k98LDAzUypUrtW7dOq1fv17ffPONPv/8c/Xr10/9+/c32s+1ly9z4/cfC3CzjwnIysr6Q/vJqT/ydw24CqIJuAe4ubnpjTfeULdu3TR79mz16dNHFStWlCR5eHgoNDT0ttto2bKlpk+frm+++Ub+/v5KS0tTRESEtd7X19eKMZPtmfL29r7h2YhTp05ZxyBdiZn9+/erUaNG+fJ5QosXL5bNZtNjjz12y3HFixdXy5Yt1bJlS12+fFmvvfaaZs6cqb59+6pIkSJ5PrdffvnF6c/hl19+UXZ2tnXG6uo9aL//Mzx16tR128rJ3MqXL+/0TsKrjhw5Yq0H7jbc0wTcIxo0aKDg4GB9+umnSk9Pl5+fnx599FHNnTtXp0+fvm787y+lBQYGqlq1alq+fLmWL1+u0qVL65FHHrHWu7m56amnntKqVausszK32p6pihUraufOnU4fG/Ddd9/pv//9r9O4Fi1a6Ndff9W8efOu28alS5esy2u58eGHH2rDhg1q2bKlHnjggZuOO3PmjNNjT09PBQYGyuFwKCMjQ5JUrFgxSdL58+dzPZ9rzZ492+nxZ599JunK53RJkpeXl3x8fLRt2zancZ9//vl128rJ3MLDw7Vr1y5t377dWnbx4kXNmzdPFSpUyNF9WUBhwZkm4B7Sq1cvDRgwQAsWLFDHjh01evRoderUSa1atVKHDh1UsWJFJSUlaceOHfrf//6nxYsXOz2/ZcuWioqKUpEiRfTXv/71uksugwYN0pYtW9ShQwe1b99eVatW1blz57R3715t2rRJW7duzfGc27dvr1WrVunFF19UixYtdPz4cS1ZsuS6y2TPPPOMVqxYodGjR2vLli2qV6+esrKydOTIEa1cuVIfffSRdbnyZjIzM/X1119LuvLuwJMnT2rt2rU6cOCAGjRooHHjxt3y+b169ZK/v7/q1asnPz8/HTlyRJ999pnCw8OtTxJ/6KGHJF15p2HLli3l4eGhv/zlL9aN5jmVkJCgl156SY0bN9aOHTu0ePFiPf30007vnGvfvr0+/PBDjRw5UrVq1dK2bdtueJYoJ3Pr06ePli1bpt69e6tr167y9vbWokWLlJCQoGnTpuXJr5wBXA3RBNxDnnzySd1///36+OOP1aFDB1WtWlXz58/X9OnTtXDhQp09e1a+vr6qWbOm+vXrd93zW7ZsqcjISP32229q0aLFdev9/f315Zdf6oMPPtC3336rL774Qn/6059UtWpVDR48OFdzbty4sYYPH67Y2FhNnDhRtWrV0syZM/W3v/3NaZzdbtcHH3ygTz75RF9//bW+/fZbFStWTAEBAeratetNP1vpWpcvX9bQoUMlXTnr4uvrq1q1aqlfv35Gv3vuueee05IlSxQbG6uLFy+qXLly6tq1q1555RVrTHBwsAYMGKA5c+Zo/fr1ys7O1po1a3IdTZGRkZo6daree+89ubu7q0uXLtYxXNWvXz+lpKRo1apVWrFihR5//HF99NFH1900n5O5+fv7a86cOXr33Xf12WefKT09XUFBQZo5c6b+/Oc/5+pYAFdnc3AXHgAAwG1x/hQAAMAA0QQAAGCAaAIAADBANAEAABggmgAAAAwQTQAAAAaIJgAAAANEEwAAgAE+ETyPJSefFx8XCgBA4WCzSX5+JY3GEk15zOEQ0QQAwF2Iy3MAAAAGiCYAAAADRBMAAIABogkAAMAA0QQAAGCAaAIAADBANAEAABggmgAAAAwQTQAAAAaIJgAAAANEEwAAgAGiCQAAwADRBAAAYIBoAgAAMOBe0BNAztjtNtnttoKeBuBSsrMdys52FPQ0ANzliKZCxG636U9/Ki43N04QAtfKysrW2bMXCScA+YpoKkTsdpvc3Owa9fl6HT19rqCnA7iEymW8NaFTY9ntNqIJQL4imgqho6fPaf/JlIKeBgAA9xSu8wAAABggmgAAAAwQTQAAAAaIJgAAAANEEwAAgAGiCQAAwADRBAAAYKBAo+nHH3/USy+9pLCwMAUFBWn16tVO6x0Oh6ZOnaqwsDAFBwerR48eOnbsmNOYs2fPatCgQapXr57q16+vN998UxcuXHAas3//fnXq1Em1a9dWeHi4YmJirpvLihUr1Lx5c9WuXVutWrXS999/n+fHCwAACq8CjaaLFy8qKChIo0ePvuH6mJgYxcXFacyYMZo3b56KFSumXr16KT093RozePBgHTp0SLGxsZo5c6a2bdumt99+21qflpamXr16qXz58lqwYIGGDh2q6dOna+7cudaYn376SYMGDdJf//pXLVq0SE888YT69eunn3/+Of8OHgAAFCoFGk3h4eEaOHCgmjVrdt06h8OhWbNm6eWXX1bTpk1VvXp1TZkyRadPn7bOSB0+fFjr16/XhAkTVKdOHdWvX1+jRo3SsmXL9Ouvv0qSFi9erIyMDE2cOFEPPvigIiIi1LVrV8XGxlr7mjVrlho3bqwXX3xRgYGBev3111WzZk199tlnd+YPAgAAuDyXvacpISFBiYmJCg0NtZaVLFlSderU0fbt2yVJ27dvV6lSpVS7dm1rTGhoqOx2u3bt2iVJ2rFjh+rXry9PT09rTFhYmI4ePapz585ZYxo1auS0/7CwMO3YsSPH87bZ8u8LwK3l5+uPL774unu/TLns755LTEyUJPn5+Tkt9/PzU1JSkiQpKSlJvr6+Tuvd3d3l7e1tPT8pKUkBAQFOY/z9/a113t7eSkpKspbdaD854edXMsfPAfDH+fiUKOgpALjLuWw0FVbJyeflyKdftO7mZucHA3ATZ85cUFZWdkFPA0AhY7OZn/Bw2WgqXbq0JCk5OVllypSxlicnJ6t69eqSrpwxSklJcXpeZmamzp07Zz3f39//ujNGVx9fPbt0ozHJycnXnX0y4XAo36IJwK3x2gOQn1z2nqaAgACVLl1amzZtspalpaVp586dCgkJkSSFhIQoNTVVe/bsscZs3rxZ2dnZCg4OliTVrVtX27ZtU0ZGhjVm48aNqly5sry9va0xmzdvdtr/xo0bVbdu3fw6PAAAUMgUaDRduHBB+/bt0759+yRdufl73759OnXqlGw2m7p166YZM2ZozZo1OnDggIYOHaoyZcqoadOmkqTAwEA1btxYb731lnbt2qX4+HiNHz9eERERKlu2rCSpVatW8vDw0MiRI3Xw4EEtX75cs2bN0gsvvGDNo1u3blq/fr0+/vhjHT58WNOmTdOePXvUpUuXO/+HAgAAXJLN4Si4E9pbtmxRt27drlvepk0bTZ48WQ6HQ1FRUZo3b55SU1P18MMPa/To0apcubI19uzZsxo/frzWrl0ru92uJ598UqNGjVKJEv9378/+/fs1btw47d69Wz4+PurSpYv69OnjtM8VK1YoMjJSJ0+e1AMPPKAhQ4YoPDw8x8eUlJR/9zS5u1+5p6lz5FLtP5ly+ycA94DqFXw1+/WndebMBWVmck8TgJyx2SR/f7N7mgo0mu5GRBNwZxFNAP6InESTy97TBAAA4EqIJgAAAANEEwAAgAGiCQAAwADRBAAAYIBoAgAAMEA0AQAAGCCaAAAADBBNAAAABogmAAAAA0QTAACAAaIJAADAANEEAABggGgCAAAwQDQBAAAYIJoAAAAMEE0AAAAGiCYAAAADRBMAAIABogkAAMAA0QQAAGCAaAIAADBANAEAABggmgAAAAwQTQAAAAaIJgAAAANEEwAAgAGiCQAAwADRBAAAYIBoAgAAMEA0AQAAGCCaAAAADBBNAAAABogmAAAAA0QTAACAAaIJAADAANEEAABggGgCAAAwQDQBAAAYIJoAAAAMEE0AAAAGiCYAAAADRBMAAIABogkAAMAA0QQAAGCAaAIAADBANAEAABggmgAAAAwQTQAAAAaIJgAAAANEEwAAgAGiCQAAwADRBAAAYIBoAgAAMEA0AQAAGCCaAAAADLh0NGVlZSkyMlJNmjRRcHCwmjZtqg8++EAOh8Ma43A4NHXqVIWFhSk4OFg9evTQsWPHnLZz9uxZDRo0SPXq1VP9+vX15ptv6sKFC05j9u/fr06dOql27doKDw9XTEzMnThEAABQSLh0NMXExOiLL77Q22+/reXLl2vw4MH66KOPFBcX5zQmLi5OY8aM0bx581SsWDH16tVL6enp1pjBgwfr0KFDio2N1cyZM7Vt2za9/fbb1vq0tDT16tVL5cuX14IFCzR06FBNnz5dc+fOvaPHCwAAXJdLR9P27dv1xBNP6M9//rMCAgLUvHlzhYWFadeuXZKunGWaNWuWXn75ZTVt2lTVq1fXlClTdPr0aa1evVqSdPjwYa1fv14TJkxQnTp1VL9+fY0aNUrLli3Tr7/+KklavHixMjIyNHHiRD344IOKiIhQ165dFRsbW2DHDgAAXItLR1NISIg2b96so0ePSrpyCS0+Pl6PP/64JCkhIUGJiYkKDQ21nlOyZEnVqVNH27dvl3QlvEqVKqXatWtbY0JDQ2W326342rFjh+rXry9PT09rTFhYmI4ePapz587l+3ECAADX517QE7iVPn36KC0tTS1atJCbm5uysrI0cOBAtW7dWpKUmJgoSfLz83N6np+fn5KSkiRJSUlJ8vX1dVrv7u4ub29v6/lJSUkKCAhwGuPv72+t8/b2Np6zzZaDAwSQp3j9AcipnPy74dLRtGLFCi1ZskTvvfeeqlatqn379mnSpEkqU6aM2rRpU9DTuyE/v5IFPQXgnuTjU6KgpwDgLufS0TRlyhT16dNHERERkqSgoCCdOnVK0dHRatOmjUqXLi1JSk5OVpkyZaznJScnq3r16pKunDFKSUlx2m5mZqbOnTtnPd/f3986M3XV1cdXzziZSk4+r2ve3Jen3Nzs/GAAbuLMmQvKysou6GkAKGRsNvMTHi4dTZcuXZLtd+fN3NzcrI8cCAgIUOnSpbVp0ybVqFFD0pV3wu3cuVMdO3aUdOW+qNTUVO3Zs0e1atWSJG3evFnZ2dkKDg6WJNWtW1eRkZHKyMiQh4eHJGnjxo2qXLlyji7NSZLDoXyLJgC3xmsPQH5y6RvB//KXv2jmzJlat26dEhIS9O233yo2NlZNmzaVJNlsNnXr1k0zZszQmjVrdODAAQ0dOlRlypSxxgQGBqpx48Z66623tGvXLsXHx2v8+PGKiIhQ2bJlJUmtWrWSh4eHRo4cqYMHD2r58uWaNWuWXnjhhQI7dgAA4FpsDofr/r9ZWlqapk6dqtWrV1uX4CIiItSvXz/rnW4Oh0NRUVGaN2+eUlNT9fDDD2v06NGqXLmytZ2zZ89q/PjxWrt2rex2u5588kmNGjVKJUr836Wu/fv3a9y4cdq9e7d8fHzUpUsX9enTJ8dzTkrKv8tz7u5XLs91jlyq/SdTbv8E4B5QvYKvZr/+tM6cuaDMTC7PAcgZm03y9ze7POfS0VQYEU3AnUU0AfgjchJNLn15DgAAwFUQTQAAAAaIJgAAAANEEwAAgAGiCQAAwADRBAAAYIBoAgAAMEA0AQAAGCCaAAAADBBNAAAABogmAAAAA0QTAACAAaIJAADAANEEAABggGgCAAAwQDQBAAAYIJoAAAAMEE0AAAAGiCYAAAADRBMAAIABogkAAMAA0QQAAGCAaAIAADBANAEAABggmgAAAAwQTQAAAAaIJgAAAANEEwAAgAGiCQAAwADRBAAAYIBoAgAAMEA0AQAAGCCaAAAADBBNAAAABogmAAAAA0QTAACAAaIJAADAANEEAABggGgCAAAwQDQBAAAYIJoAAAAMEE0AAAAGiCYAAAADRBMAAIABogkAAMAA0QQAAGCAaAIAADBANAEAABggmgAAAAwQTQAAAAaIJgAAAANEEwAAgAGiCQAAwADRBAAAYIBoAgAAMEA0AQAAGCCaAAAADLh8NP36668aPHiwGjRooODgYLVq1Uq7d++21jscDk2dOlVhYWEKDg5Wjx49dOzYMadtnD17VoMGDVK9evVUv359vfnmm7pw4YLTmP3796tTp06qXbu2wsPDFRMTcycODwAAFBIuHU3nzp1Tx44d5eHhoZiYGC1btkzDhg2Tt7e3NSYmJkZxcXEaM2aM5s2bp2LFiqlXr15KT0+3xgwePFiHDh1SbGysZs6cqW3btuntt9+21qelpalXr14qX768FixYoKFDh2r69OmaO3fuHT1eAADgutwLegK3EhMTo3LlymnSpEnWsooVK1rfOxwOzZo1Sy+//LKaNm0qSZoyZYpCQ0O1evVqRURE6PDhw1q/fr2++uor1a5dW5I0atQo9enTR0OHDlXZsmW1ePFiZWRkaOLEifL09NSDDz6offv2KTY2Vs8999ydPWgAAOCSXPpM09q1a1WrVi31799fjRo10rPPPqt58+ZZ6xMSEpSYmKjQ0FBrWcmSJVWnTh1t375dkrR9+3aVKlXKCiZJCg0Nld1u165duyRJO3bsUP369eXp6WmNCQsL09GjR3Xu3Lkczdlmy78vALeWn68/vvji6+79MuXSZ5pOnDihL774Qi+88IJeeukl7d69WxMmTJCHh4fatGmjxMRESZKfn5/T8/z8/JSUlCRJSkpKkq+vr9N6d3d3eXt7W89PSkpSQECA0xh/f39r3bWXA2/Hz69kzg4SQJ7w8SlR0FMAcJfLVTR169ZN06dPV6lSpZyWp6Wl6ZVXXtGsWbPyZHIOh0O1atXSG2+8IUmqWbOmDh48qDlz5qhNmzZ5so+8lpx8Xg5H/mzbzc3ODwbgJs6cuaCsrOyCngaAQsZmMz/hkato2rp1qzIyMq5bnp6ervj4+Nxs8oZKly6twMBAp2VVqlTRqlWrrPWSlJycrDJlylhjkpOTVb16dUlXzhilpKQ4bSMzM1Pnzp2znu/v72+dmbrq6uOrZ5xMORzKt2gCcGu89gDkpxxF0/79+63vDx06ZF3ekqTs7GytX79eZcuWzbPJ1atXT0ePHnVaduzYMVWoUEGSFBAQoNKlS2vTpk2qUaOGpCtnu3bu3KmOHTtKkkJCQpSamqo9e/aoVq1akqTNmzcrOztbwcHBkqS6desqMjJSGRkZ8vDwkCRt3LhRlStXztGlOQAAcPfKUTQ9++yzstlsstls6t69+3XrixYtqlGjRuXZ5Lp3766OHTtq5syZatGihXbt2qV58+Zp3LhxkiSbzaZu3bppxowZqlSpkgICAjR16lSVKVPGejddYGCgGjdurLfeektjx45VRkaGxo8fr4iICCvwWrVqpQ8++EAjR45U7969dfDgQc2aNUsjRozIs2MBAACFm83hMD+hffLkSTkcDjVt2lRffvml0w3WHh4e8vPzk5ubW55O8LvvvtM//vEPHTt2TAEBAXrhhRfUoUMHa73D4VBUVJTmzZun1NRUPfzwwxo9erQqV65sjTl79qzGjx+vtWvXym6368knn9SoUaNUosT/3R+0f/9+jRs3Trt375aPj4+6dOmiPn365Hi+SUn5d0+Tu/uVe5o6Ry7V/pMpt38CcA+oXsFXs19/WmfOXFBmJvc0AcgZm03y9ze7pylH0YTbI5qAO4toAvBH5CSacv2RA8eOHdOWLVuUnJys7Gznf6heffXV3G4WAADAJeUqmubNm6cxY8bIx8dH/v7+sl3zyVA2m41oAgAAd51cRdOMGTP0+uuv5+qeHwAAgMIoV79G5dy5c2rRokVezwUAAMBl5Sqamjdvrg0bNuT1XAAAAFxWri7PVapUSVOnTtXOnTtVrVo1ubs7b6Zbt255MjkAAABXkatomjt3rooXL66tW7dq69atTuuufuAkAADA3SRX0bR27dq8ngcAAIBLy9U9TQAAAPeaXJ1put3vZJs0aVKuJgMAAOCqchVNqampTo8zMzN18OBBpaamqmHDhnkyMQAAAFeSq2j64IMPrluWnZ2tMWPGqGLFin94UgAAAK4mz+5pstvt6tGjhz799NO82iQAAIDLyNMbwU+cOKHMzMy83CQAAIBLyNXlud/f6O1wOJSYmKh169apTZs2eTIxAAAAV5KraPrPf/7j9Nhut8vX11fDhw9Xu3bt8mRiAAAAriRX0RQXF5fX8wAAAHBpuYqmq1JSUnTkyBFJUpUqVeTr65snkwIAAHA1uYqmixcvavz48fr666+VnZ0tSXJzc9Mzzzyjt956S8WKFcvTSQIAABS0XL17bvLkyfrxxx81Y8YMbdu2Tdu2bdM///lP/fjjj5o8eXJezxEAAKDA5SqaVq1apXfeeUfh4eHy8vKSl5eXwsPDNX78eK1atSqv5wgAAFDgchVNly5dkr+//3XL/fz8dOnSpT88KQAAAFeTq2iqW7euoqKilJ6ebi27dOmSpk+frrp16+bV3AAAAFxGrm4Ef/PNN/Xiiy/q8ccfV/Xq1SVJ+/fvl6enpz7++OM8nSAAAIAryFU0BQUF6ZtvvtGSJUusjxx4+umn1apVKxUtWjRPJwgAAOAKchVN0dHR8vPzU4cOHZyWf/XVV0pJSVGfPn3yZHIAAACuIlf3NM2dO1dVqlS5bvmDDz6oOXPm/OFJAQAAuJpcRVNiYqJKly593XJfX18lJib+4UkBAAC4mlxF03333aeffvrpuuXx8fEqU6bMH54UAACAq8nVPU3t27fXxIkTlZmZqYYNG0qSNm3apHfffVc9e/bM0wkCAAC4glxF04svvqizZ89q7NixysjIkCQVKVJEL774ovr27ZunEwQAAHAFuYomm82mIUOG6JVXXtHhw4dVtGhRPfDAA/L09Mzr+QEAALiEXEXTVSVKlFBwcHBezQUAAMBl5epGcAAAgHsN0QQAAGCAaAIAADBANAEAABggmgAAAAwQTQAAAAaIJgAAAANEEwAAgAGiCQAAwADRBAAAYIBoAgAAMEA0AQAAGCCaAAAADBBNAAAABogmAAAAA0QTAACAAaIJAADAANEEAABggGgCAAAwQDQBAAAYIJoAAAAMEE0AAAAGiCYAAAADhSqaPvzwQwUFBemdd96xlqWnp2vs2LFq0KCBQkJC9NprrykpKcnpeadOnVKfPn1Up04dNWrUSH/729+UmZnpNGbLli1q06aNatWqpWbNmmnBggV35JgAAEDhUGiiadeuXZozZ46CgoKclk+cOFHfffedIiMjFRcXp9OnT+vVV1+11mdlZalv377KyMjQnDlzNHnyZC1cuFBRUVHWmBMnTqhv375q0KCBvv76a3Xv3l2jRo3S+vXr79jxAQAA11YoounChQsaMmSIJkyYIG9vb2v5+fPnNX/+fA0fPlyNGjVSrVq1NHHiRG3fvl07duyQJG3YsEGHDh3Su+++qxo1aig8PFwDBgzQ7NmzdfnyZUnSnDlzFBAQoOHDhyswMFBdunTRU089pU8++aQAjhYAALiiQhFN48aNU3h4uEJDQ52W79mzRxkZGU7LAwMDVb58eSuaduzYoWrVqsnf398aExYWprS0NB06dMga06hRI6dth4WFWdsAAABwL+gJ3M6yZcv0n//8R1999dV165KSkuTh4aFSpUo5Lffz81NiYqI15tpgkmQ9vt2YtLQ0Xbp0SUWLFjWer81mPBRAHuP1ByCncvLvhktH03//+1+98847+vjjj1WkSJGCno4RP7+SBT0F4J7k41OioKcA4C7n0tG0d+9eJScnq23bttayrKws/fjjj5o9e7b+9a9/KSMjQ6mpqU5nm5KTk1W6dGlJV84Y7dq1y2m7V99dd+2Y37/jLikpSV5eXjk6y3Rl3+flcOToKcbc3Oz8YABu4syZC8rKyi7oaQAoZGw28xMeLh1NDRs21JIlS5yWjRgxQlWqVFHv3r113333ycPDQ5s2bdJTTz0lSTpy5IhOnTqlunXrSpLq1q2rmTNnKjk5WX5+fpKkjRs3ysvLS1WrVrXG/PDDD0772bhxo7WNnHA4lG/RBODWeO0ByE8uHU1eXl6qVq2a07LixYvrT3/6k7W8Xbt2mjx5sry9veXl5aUJEyYoJCTECp6wsDBVrVpVQ4cO1ZAhQ5SYmKjIyEh17txZnp6ekqTnn39es2fP1pQpU9SuXTtt3rxZK1asUHR09B09XgAA4LpcOppMvPnmm7Lb7erfv78uX76ssLAwjR492lrv5uammTNnasyYMXruuedUrFgxtWnTRv3797fGVKxYUdHR0Zo0aZJmzZqlcuXKacKECWrcuHFBHBIAAHBBNoeDE9p5KSkp/+5pcne/ck9T58il2n8yJX92AhQy1Sv4avbrT+vMmQvKzOSeJgA5Y7NJ/v5m9zQVis9pAgAAKGhEEwAAgAGiCQAAwADRBAAAYIBoAgAAMEA0AQAAGCCaAAAADBBNAAAABogmAAAAA0QTAACAAaIJAADAANEEAABggGgCAAAwQDQBAAAYIJoAAAAMEE0AAAAGiCYAAAADRBMAAIABogkAAMAA0QQAAGCAaAIAADBANAEAABggmgAAAAwQTQAAAAaIJgAAAANEEwAAgAGiCQAAwADRBAAAYIBoAgAAMEA0AQAAGCCaAAAADBBNAAAABogmAAAAA0QTAACAAaIJAADAANEEAABggGgCAAAwQDQBAAAYIJoAAAAMEE0AAAAGiCYAAAADRBMAAIABogkAAMAA0QQAAGCAaAIAADBANAEAABggmgAAAAwQTQAAAAaIJgAAAANEEwAAgAGiCQAAwADRBAAAYIBoAgAAMEA0AQAAGCCaAAAADBBNAAAABogmAAAAAy4dTdHR0WrXrp1CQkLUqFEjvfLKKzpy5IjTmPT0dI0dO1YNGjRQSEiIXnvtNSUlJTmNOXXqlPr06aM6deqoUaNG+tvf/qbMzEynMVu2bFGbNm1Uq1YtNWvWTAsWLMj34wMAAIWHS0fT1q1b1blzZ82bN0+xsbHKzMxUr169dPHiRWvMxIkT9d133ykyMlJxcXE6ffq0Xn31VWt9VlaW+vbtq4yMDM2ZM0eTJ0/WwoULFRUVZY05ceKE+vbtqwYNGujrr79W9+7dNWrUKK1fv/6OHi8AAHBd7gU9gVv517/+5fR48uTJatSokfbu3atHHnlE58+f1/z58/X3v/9djRo1knQlolq2bKkdO3aobt262rBhgw4dOqTY2Fj5+/urRo0aGjBggP7+97/r1Vdflaenp+bMmaOAgAANHz5ckhQYGKj4+Hh98sknaty48R0/bgAA4Hpc+kzT750/f16S5O3tLUnas2ePMjIyFBoaao0JDAxU+fLltWPHDknSjh07VK1aNfn7+1tjwsLClJaWpkOHDlljrkbXtWOubiMnbLb8+wJwa/n5+uOLL77u3i9TLn2m6VrZ2dmaOHGi6tWrp2rVqkmSkpKS5OHhoVKlSjmN9fPzU2JiojXm2mCSZD2+3Zi0tDRdunRJRYsWNZ6nn1/JnB0YgDzh41OioKcA4C5XaKJp7NixOnjwoD7//POCnsotJSefl8ORP9t2c7PzgwG4iTNnLigrK7ugpwGgkLHZzE94FIpoGjdunNatW6fPPvtM5cqVs5b7+/srIyNDqampTmebkpOTVbp0aWvMrl27nLZ39d111475/TvukpKS5OXllaOzTJLkcCjfognArRX2157dbpPdnoNrBcA9IDvboexs13hxu3Q0ORwOjR8/Xt9++63i4uJUsWJFp/W1atWSh4eHNm3apKeeekqSdOTIEZ06dUp169aVJNWtW1czZ85UcnKy/Pz8JEkbN26Ul5eXqlatao354YcfnLa9ceNGaxsAkN/sdpv+9KficnMrVLeaAvkuKytbZ89edIlwculoGjt2rJYuXap//vOfKlGihHUPUsmSJVW0aFGVLFlS7dq10+TJk+Xt7S0vLy9NmDBBISEhVvCEhYWpatWqGjp0qIYMGaLExERFRkaqc+fO8vT0lCQ9//zzmj17tqZMmaJ27dpp8+bNWrFihaKjowvq0AHcY+x2m9zc7Br1+XodPX2uoKcDuITKZbw1oVNj2e02oul2vvjiC0lS165dnZZPmjRJbdu2lSS9+eabstvt6t+/vy5fvqywsDCNHj3aGuvm5qaZM2dqzJgxeu6551SsWDG1adNG/fv3t8ZUrFhR0dHRmjRpkmbNmqVy5cppwoQJfNwAgDvu6Olz2n8ypaCnAeAGXDqaDhw4cNsxRYoU0ejRo51C6fcqVKigmJiYW26nQYMGWrRoUU6nCAAA7hFcPAcAADBANAEAABggmgAAAAwQTQAAAAaIJgAAAANEEwAAgAGiCQAAwADRBAAAYIBoAgAAMEA0AQAAGCCaAAAADBBNAAAABogmAAAAA0QTAACAAaIJAADAANEEAABggGgCAAAwQDQBAAAYIJoAAAAMEE0AAAAGiCYAAAADRBMAAIABogkAAMAA0QQAAGCAaAIAADBANAEAABggmgAAAAwQTQAAAAaIJgAAAANEEwAAgAGiCQAAwADRBAAAYIBoAgAAMEA0AQAAGCCaAAAADBBNAAAABogmAAAAA0QTAACAAaIJAADAANEEAABggGgCAAAwQDQBAAAYIJoAAAAMEE0AAAAGiCYAAAADRBMAAIABogkAAMAA0QQAAGCAaAIAADBANAEAABggmgAAAAwQTQAAAAaIJgAAAANEEwAAgAGiCQAAwADRBAAAYIBo+p3Zs2erSZMmql27ttq3b69du3YV9JQAAIALIJqusXz5ck2aNEn9+vXTwoULVb16dfXq1UvJyckFPTUAAFDAiKZrxMbGqkOHDmrXrp2qVq2qsWPHqmjRopo/f35BTw0AABQwoun/u3z5svbu3avQ0FBrmd1uV2hoqLZv316AMwMAAK7AvaAn4CrOnDmjrKws+fn5OS338/PTkSNHjLdjt0sOR17Pzln18r4q5slfHSBJlfxLWd/b74L/DeT1DfyfO/H6ttnMx/LKzGO+viXzfR9vdQi9/SDgHuPjU6Kgp5AneH0D13OV1/dd8P9lecPHx0dubm7X3fSdnJwsf3//ApoVAABwFUTT/+fp6amHHnpImzZtspZlZ2dr06ZNCgkJKcCZAQAAV8DluWu88MILGjZsmGrVqqXg4GB9+umn+u2339S2bduCnhoAAChgRNM1WrZsqZSUFEVFRSkxMVE1atTQRx99xOU5AAAgm8OR3+/1AgAAKPy4pwkAAMAA0QQAAGCAaAIAADBANAEAABggmoAcWrBggerXr1/Q0wAA3GF85ADuWcOHD9fChQuvW/7NN9+oUqVKBTAjAHkpKCjolutfffVVvfbaa3doNrgbEE24pzVu3FiTJk1yWubr61tAswGQlzZs2GB9v3z5ckVFRWnlypXWsuLFi1vfOxwOZWVlyd2dH4u4OS7P4Z7m6emp0qVLO33NmjVLrVq1Ut26dRUeHq4xY8bowoULN93G/v371bVrV4WEhKhevXpq27atdu/eba3ftm2bOnXqpODgYIWHh2vChAm6ePHinTg84J527eu6ZMmSstls1uMjR46oXr16+v7779W2bVvVrl1b8fHxGj58uF555RWn7bzzzjvq2rWr9Tg7O1vR0dFq0qSJgoOD1bp1a6cYw92LaAJ+x2azaeTIkVq6dKkmT56szZs36913373p+MGDB6tcuXL66quvtGDBAvXu3VseHh6SpOPHj6t379568skntXjxYr3//vuKj4/X+PHj79ThALiF9957T4MGDdLy5ctveznvqujoaC1atEhjx47VsmXL1KNHDw0ZMkRbt27N59mioHEeEve0devWOf1C5saNGysqKsp6HBAQoNdff12jR4/WmDFjbriNU6dOqVevXgoMDJQkPfDAA9a66OhotWrVSj169LDWjRw5Ul27dtWYMWNUpEiRPD8mAOb69++vxx57zHj85cuXFR0drdjYWOvfjooVKyo+Pl5z587Vo48+ml9ThQsgmnBPa9CggVMMFStWTBs3blR0dLSOHDmitLQ0ZWVlKT09Xb/99puKFSt23TZeeOEFjRo1Sl9//bVCQ0PVvHlz3X///ZKuXLo7cOCAlixZYo13OBzKzs5WQkKCFVoACkbt2rVzNP6XX37Rb7/9pp49ezotz8jIUI0aNfJyanBBRBPuacWKFXN6p1xCQoL69u2rjh07auDAgfL29lZ8fLxGjhypjIyMG0bTa6+9pqefflrff/+9fvjhB0VFRen9999Xs2bNdPHiRT3//PNO90Ncdd999+XrsQG4vd+/pm02m37/K1kzMzOt76/ejxgdHa2yZcs6jfP09MynWcJVEE3ANfbu3SuHw6Hhw4fLbr9yy9+KFStu+7zKlSurcuXK6tGjh9544w3Nnz9fzZo1U82aNXXo0CE+wgAoJHx9fXXw4EGnZfv27bPuUwwMDJSnp6dOnTrFpbh7EDeCA9eoVKmSMjIyFBcXpxMnTmjRokWaM2fOTcdfunRJ48aN05YtW3Ty5EnFx8dr9+7d1mW33r17a/v27Ro3bpz27dunY8eOafXq1Ro3btydOiQAOdCwYUPt2bNHixYt0rFjxxQVFeUUUV5eXurZs6cmTZqkhQsX6vjx49q7d6/i4uJu+LlvuLtwpgm4RvXq1TVixAjFxMToH//4h+rXr6833nhDw4YNu+F4u92us2fPatiwYUpKSpKPj4+efPJJ9e/f39peXFycIiMj1alTJ0lXbhpt2bLlHTsmAOYaN26sV155Re+++67S09PVrl07Pfvss/r555+tMa+//rp8fX0VHR2thIQElSxZUjVr1tRLL71UgDPHnWBz/P7iLQAAAK7D5TkAAAADRBMAAIABogkAAMAA0QQAAGCAaAIAADBANAEAABggmgAAAAwQTQAAAAb4RHAAhc7w4cOtX1nh7u6usmXLqnnz5howYICKFClSwLMDcLcimgAUSo0bN9akSZOUmZmpvXv3atiwYbLZbBoyZEhBTw3AXYrLcwAKJU9PT5UuXVr33XefmjZtqtDQUG3cuFGSlJ2drejoaDVp0kTBwcFq3bq1Vq5caa17/PHH9fnnnztt7z//+Y+qV6+ukydPSpJSU1M1cuRINWzYUPXq1VO3bt20f/9+a/y0adP0zDPPaNGiRWrSpIkefvhhDRw4UGlpadaYJk2a6JNPPnHazzPPPKNp06ZZj2+3HwCug2gCUOj9/PPP2r59uzw8PCRJ0dHRWrRokcaOHatly5apR48eGjJkiLZu3Sq73a6IiAgtXbrUaRtLlixRvXr1VKFCBUnSgAEDlJycrJiYGC1YsEAPPfSQunfvrrNnz1rPOX78uNasWaOZM2cqOjpaP/74o2JiYnI0d5P9AHANXJ4DUCitW7dOISEhyszM1OXLl2W32/XWW2/p8uXLio6OVmxsrEJCQiRJFStWVHx8vObOnatHH31UrVu3VmxsrE6dOqXy5csrOztby5Yt08svvyxJ2rZtm3bt2qVNmzbJ09NTkjRs2DCtXr1aq1at0nPPPSdJcjgcmjRpkry8vCRJrVu31qZNmzRw4ECjYzDdDwDXQDQBKJQaNGigMWPG6LffftMnn3wiNzc3PfXUUzp48KB+++039ezZ02l8RkaGatSoIUmqUaOGAgMDtXTpUvXp00dbt25VSkqKmjdvLkk6cOCALl68qAYNGjht49KlSzp+/Lj1uEKFClYwSVKZMmWUnJxsfAym+wHgGogmAIVSsWLFVKlSJUnSxIkT9cwzz+jLL79UtWrVJF25RFe2bFmn51w9myNJrVq10pIlS9SnTx8tXbpUYWFh8vHxkSRduHBBpUuXVlxc3HX7LVmypPW9u/v1/4Q6HA7re5vNdt36zMxM63vT/QBwDUQTgELPbrerb9++mjx5slauXClPT0+dOnVKjz766E2f8/TTTysyMlJ79uzRqlWrNHbsWGvdQw89pKSkJLm5uSkgICDX8/L19dXp06etx2lpaUpISMjz/QC4M7gRHMBdoXnz5rLb7Zo7d6569uypSZMmaeHChTp+/Lj27t2ruLg467OdJCkgIEAhISEaOXKksrKy1KRJE2tdaGio6tatq379+mnDhg1KSEjQTz/9pPfff1+7d+82nlPDhg21ePFibdu2TQcOHNCwYcNkt//fP7t5tR8AdwZnmgDcFdzd3dWlSxd99NFHWrNmjXx9fRUdHa2EhASVLFlSNWvW1EsvveT0nFatWmns2LF69tlnVbRoUWu5zWbThx9+qMjISI0YMUJnzpyRv7+/6tevL39/f+M59e3bVwkJCerbt69KliypAQMGOJ1pyqv9ALgzbI5rL8ADAADghrg8BwAAYIBoAgAAMEA0AQAAGCCaAAAADBBNAAAABogmAAAAA0QTAACAAaIJAADAANEEAABggGgCAAAwQDQBAAAYIJoAAAAM/D/NUOgAU/bqrgAAAABJRU5ErkJggg==" }, "metadata": {}, "output_type": "display_data" } ], "execution_count": 1 }, { "metadata": { "ExecuteTime": { "end_time": "2025-05-17T12:44:17.327021Z", "start_time": "2025-05-17T12:44:17.235025Z" } }, "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", "from sklearn.compose import ColumnTransformer\n", "\n", "# --- 数据预处理 ---\n", "# 将布尔值转换为整数\n", "df['Weekend'] = df['Weekend'].astype(int)\n", "df['Revenue'] = df['Revenue'].astype(int) # 目标变量\n", "\n", "# 识别类别特征和数值特征\n", "categorical_features = ['Month', 'VisitorType', 'OperatingSystems', 'Browser', 'Region', 'TrafficType']\n", "# 'OperatingSystems', 'Browser', 'Region', 'TrafficType' 是数值类型,但它们代表类别,所以也当类别处理\n", "# 确保将这些数值型类别特征转换为字符串类型,以便OneHotEncoder正确处理\n", "for col in ['OperatingSystems', 'Browser', 'Region', 'TrafficType']:\n", " df[col] = df[col].astype(str)\n", "\n", "numerical_features = ['Administrative', 'Administrative_Duration', 'Informational',\n", " 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',\n", " 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']\n", "\n", "# 创建预处理器\n", "# 对于数值特征:进行标准化\n", "# 对于类别特征:进行独热编码 (One-Hot Encoding)\n", "preprocessor = ColumnTransformer(\n", " transformers=[\n", " ('num', StandardScaler(), numerical_features),\n", " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)\n", " ],\n", " remainder='passthrough' # 保留其他未指定列 (如 'Weekend', 'Revenue')\n", ")\n", "\n", "# 分离特征和目标变量\n", "X = df.drop('Revenue', axis=1)\n", "y = df['Revenue']\n", "\n", "# 应用预处理\n", "# 注意:ColumnTransformer 会改变列的顺序和数量\n", "X_processed = preprocessor.fit_transform(X)\n", "\n", "# 如果 X_processed 是稀疏矩阵,转换为密集数组\n", "if hasattr(X_processed, \"toarray\"):\n", " X_processed = X_processed.toarray()\n", "\n", "print(f\"\\n--- 处理后的特征维度 ---\")\n", "print(X_processed.shape)\n", "\n", "# 划分训练集和测试集\n", "# 这里的 random_state 是为了结果可复现\n", "X_train, X_test, y_train, y_test = train_test_split(X_processed, y.values, test_size=0.2, random_state=42, stratify=y)\n", "\n", "print(f\"训练集大小: X_train: {X_train.shape}, y_train: {y_train.shape}\")\n", "print(f\"测试集大小: X_test: {X_test.shape}, y_test: {y_test.shape}\")" ], "id": "1945b351cafe24fb", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "--- 处理后的特征维度 ---\n", "(12330, 74)\n", "训练集大小: X_train: (9864, 74), y_train: (9864,)\n", "测试集大小: X_test: (2466, 74), y_test: (2466,)\n" ] } ], "execution_count": 2 }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "# --- 从零实现逻辑回归 ---\n", "class MyLogisticRegression:\n", " def __init__(self, learning_rate=0.01, n_iterations=1000, verbose=False):\n", " self.learning_rate = learning_rate\n", " self.n_iterations = n_iterations\n", " self.weights = None\n", " self.bias = None\n", " self.verbose = verbose # 是否打印训练过程中的损失\n", " self.costs = [] # 记录每次迭代的损失\n", "\n", " def _sigmoid(self, z):\n", " # 防止溢出\n", " z = np.clip(z, -500, 500)\n", " return 1 / (1 + np.exp(-z))\n", "\n", " def fit(self, X, y):\n", " n_samples, n_features = X.shape\n", " # 初始化权重和偏置\n", " self.weights = np.zeros(n_features)\n", " self.bias = 0\n", " self.costs = []\n", "\n", " # 梯度下降\n", " for i in range(self.n_iterations):\n", " # 线性模型: z = X.w + b\n", " linear_model = np.dot(X, self.weights) + self.bias\n", " # 应用sigmoid函数得到预测概率\n", " y_predicted_proba = self._sigmoid(linear_model)\n", "\n", " # 计算梯度\n", " dw = (1 / n_samples) * np.dot(X.T, (y_predicted_proba - y))\n", " db = (1 / n_samples) * np.sum(y_predicted_proba - y)\n", "\n", " # 更新权重和偏置\n", " self.weights -= self.learning_rate * dw\n", " self.bias -= self.learning_rate * db\n", "\n", " # 计算并记录损失 (Binary Cross-Entropy)\n", " # 添加一个小的epsilon防止log(0)\n", " epsilon = 1e-9\n", " cost = - (1 / n_samples) * np.sum(\n", " y * np.log(y_predicted_proba + epsilon) + (1 - y) * np.log(1 - y_predicted_proba + epsilon))\n", " self.costs.append(cost)\n", "\n", " if self.verbose and (i % (self.n_iterations // 10) == 0 or i == self.n_iterations - 1):\n", " print(f\"Iteration {i}, Cost: {cost:.4f}\")\n", "\n", " def predict_proba(self, X):\n", " linear_model = np.dot(X, self.weights) + self.bias\n", " return self._sigmoid(linear_model)\n", "\n", " def predict(self, X, threshold=0.5):\n", " y_predicted_proba = self.predict_proba(X)\n", " y_predicted_labels = [1 if i > threshold else 0 for i in y_predicted_proba]\n", " return np.array(y_predicted_labels)" ], "id": "7b1a931cb634d14b" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "# --- 训练自定义逻辑回归模型 ---\n", "print(\"\\n--- 训练自定义逻辑回归模型 ---\")\n", "log_reg_model = MyLogisticRegression(learning_rate=0.1, n_iterations=2000, verbose=True) # 调整参数\n", "log_reg_model.fit(X_train, y_train)\n", "\n", "# 绘制损失曲线\n", "plt.figure()\n", "plt.plot(range(len(log_reg_model.costs)), log_reg_model.costs)\n", "plt.xlabel(\"Iteration\")\n", "plt.ylabel(\"Cost\")\n", "plt.title(\"Logistic Regression Training Cost\")\n", "plt.show()\n", "\n", "# --- 进行预测 ---\n", "y_pred_proba = log_reg_model.predict_proba(X_test) # 获取概率用于ROC曲线\n", "y_pred_labels = log_reg_model.predict(X_test) # 获取类别标签" ], "id": "ed11b643a3bf6061" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "from sklearn.metrics import accuracy_score, roc_curve, auc, precision_score, recall_score, f1_score, confusion_matrix, \\\n", " classification_report\n", "\n", "print(\"\\n--- 模型评估 ---\")\n", "accuracy = accuracy_score(y_test, y_pred_labels)\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "\n", "print(\"\\nClassification Report👀:\")\n", "print(classification_report(y_test, y_pred_labels, target_names=['Will Not Buy (0)', 'Will Buy (1)']))\n", "\n", "print(\"\\n混淆矩阵:\")\n", "cm = confusion_matrix(y_test, y_pred_labels)\n", "sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Will Not Buy', 'Will Buy'],\n", " yticklabels=['Will Not Buy', 'Will Buy'])\n", "plt.xlabel('Predicted')\n", "plt.ylabel('Actual')\n", "plt.title('Confusion Matrix')\n", "plt.show()\n", "\n", "# ROC曲线和AUC\n", "fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)\n", "roc_auc = auc(fpr, tpr)\n", "\n", "plt.figure()\n", "plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')\n", "plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')\n", "plt.xlim([0.0, 1.0])\n", "plt.ylim([0.0, 1.05])\n", "plt.xlabel('False Positive Rate')\n", "plt.ylabel('True Positive Rate')\n", "plt.title('ROC Curve')\n", "plt.legend(loc=\"lower right\")\n", "plt.show()\n", "\n", "print(f\"AUC: {roc_auc:.4f}\")" ], "id": "1b9c8a29f662d051" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "import numpy as np\n", "from tqdm import tqdm\n", "import time\n", "\n", "\n", "def linear_kernel(X1, X2):\n", " return np.dot(X1, X2.T)\n", "\n", "\n", "def rbf_kernel(X1, X2, gamma=0.1):\n", " if X1.ndim == 1:\n", " X1 = X1[np.newaxis, :]\n", " if X2.ndim == 1:\n", " X2 = X2[np.newaxis, :]\n", " sq_dists = np.sum(X1 ** 2, axis=1)[:, None] + np.sum(X2 ** 2, axis=1) - 2 * np.dot(X1, X2.T)\n", " return np.exp(-gamma * sq_dists)\n", "\n", "\n", "class SMO_SVM:\n", " def __init__(self, C=1.0, kernel='rbf', gamma=0.1, tol=1e-3, max_passes=5):\n", " self.C = C\n", " self.gamma = gamma\n", " self.tol = tol\n", " self.max_passes = max_passes\n", " self.kernel = rbf_kernel if kernel == 'rbf' else linear_kernel\n", " self.alphas = None\n", " self.b = 0\n", " self.X = None\n", " self.y = None\n", "\n", " def fit(self, X, y):\n", " y = np.where(y <= 0, -1, 1)\n", " n_samples, n_features = X.shape\n", " self.X = X\n", " self.y = y\n", " self.alphas = np.zeros(n_samples)\n", " self.b = 0\n", " passes = 0\n", " K = self.kernel(X, X) if self.kernel != rbf_kernel else rbf_kernel(X, X, self.gamma)\n", "\n", " with tqdm(total=self.max_passes, desc=\"SVM Training Progress\") as pbar:\n", " while passes < self.max_passes:\n", " alpha_changed = 0\n", " print(f\"\\nPass {passes + 1}/{self.max_passes}\")\n", " start_time = time.time()\n", " for i in range(n_samples):\n", " Ei = self._E(i, K)\n", " if (y[i] * Ei < -self.tol and self.alphas[i] < self.C) or (\n", " y[i] * Ei > self.tol and self.alphas[i] > 0):\n", " j = np.random.choice([x for x in range(n_samples) if x != i])\n", " Ej = self._E(j, K)\n", "\n", " alpha_i_old = self.alphas[i].copy()\n", " alpha_j_old = self.alphas[j].copy()\n", "\n", " if y[i] != y[j]:\n", " L = max(0, self.alphas[j] - self.alphas[i])\n", " H = min(self.C, self.C + self.alphas[j] - self.alphas[i])\n", " else:\n", " L = max(0, self.alphas[i] + self.alphas[j] - self.C)\n", " H = min(self.C, self.alphas[i] + self.alphas[j])\n", " if L == H:\n", " continue\n", "\n", " eta = 2 * K[i, j] - K[i, i] - K[j, j]\n", " if eta >= 0:\n", " continue\n", "\n", " self.alphas[j] -= y[j] * (Ei - Ej) / eta\n", " self.alphas[j] = np.clip(self.alphas[j], L, H)\n", "\n", " if abs(self.alphas[j] - alpha_j_old) < 1e-5:\n", " continue\n", "\n", " self.alphas[i] += y[i] * y[j] * (alpha_j_old - self.alphas[j])\n", "\n", " b1 = self.b - Ei - y[i] * (self.alphas[i] - alpha_i_old) * K[i, i] - y[j] * (\n", " self.alphas[j] - alpha_j_old) * K[i, j]\n", " b2 = self.b - Ej - y[i] * (self.alphas[i] - alpha_i_old) * K[i, j] - y[j] * (\n", " self.alphas[j] - alpha_j_old) * K[j, j]\n", "\n", " if 0 < self.alphas[i] < self.C:\n", " self.b = b1\n", " elif 0 < self.alphas[j] < self.C:\n", " self.b = b2\n", " else:\n", " self.b = (b1 + b2) / 2\n", "\n", " alpha_changed += 1\n", "\n", " # # Add detailed log for each sample\n", " # if i % 100 == 0 or i == n_samples - 1:\n", " # print(f\" Sample {i + 1}/{n_samples}, alpha_changed: {alpha_changed}\")\n", "\n", " print(\n", " f\"Pass {passes + 1} finished, alpha_changed: {alpha_changed}, time: {time.time() - start_time:.2f}s\")\n", " if alpha_changed == 0:\n", " passes += 1\n", " pbar.update(1)\n", " else:\n", " passes = 0\n", "\n", " def _E(self, i, K):\n", " return self._f(i, K) - self.y[i]\n", "\n", " def _f(self, i, K):\n", " return np.sum(self.alphas * self.y * K[:, i]) + self.b\n", "\n", " def project(self, X):\n", " K = self.kernel(self.X, X) if self.kernel != rbf_kernel else rbf_kernel(self.X, X, self.gamma)\n", " return (self.alphas * self.y) @ K + self.b\n", "\n", " def predict(self, X):\n", " return np.where(self.project(X) >= 0, 1, 0)" ], "id": "6b16c3aa37baadb4" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "# --- 训练自定义线性SVM模型 ---\n", "print(\"\\n--- 训练自定义线性SVM模型 ---\")\n", "svm_model = SMO_SVM(C=1.0, kernel='rbf', tol=1e-3, max_passes=5)\n", "svm_model.fit(X_train, y_train)\n", "y_pred_labels_svm = svm_model.predict(X_test)\n", "print(\"\\n--- SVM模型评估 ---\")\n", "accuracy_svm = accuracy_score(y_test, y_pred_labels_svm)\n", "print(f\"Accuracy: {accuracy_svm:.4f}\")\n", "print(\"\\nClassification Report👀:\")\n", "print(classification_report(y_test, y_pred_labels_svm, target_names=['Will Not Buy (0)', 'Will Buy (1)']))\n", "print(\"\\n混淆矩阵:\")\n", "cm_svm = confusion_matrix(y_test, y_pred_labels_svm)\n", "sns.heatmap(cm_svm, annot=True, fmt='d', cmap='Blues', xticklabels=['Will Not Buy', 'Will Buy'],\n", " yticklabels=['Will Not Buy', 'Will Buy'])\n", "plt.xlabel('Predicted')\n", "plt.ylabel('Actual')\n", "plt.title('Confusion Matrix (SVM)')\n", "plt.show()\n", "# ROC曲线和AUC\n", "fpr_svm, tpr_svm, thresholds_svm = roc_curve(y_test, svm_model.project(X_test))\n", "roc_auc_svm = auc(fpr_svm, tpr_svm)\n", "plt.figure()\n", "plt.plot(fpr_svm, tpr_svm, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_svm:.2f})')\n", "plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')\n", "plt.xlim([0.0, 1.0])\n", "plt.ylim([0.0, 1.05])\n", "plt.xlabel('False Positive Rate')\n", "plt.ylabel('True Positive Rate')\n", "plt.title('ROC Curve (SVM)')\n", "plt.legend(loc=\"lower right\")\n", "plt.show()\n", "\n", "print(f\"AUC: {roc_auc_svm:.4f}\")" ], "id": "439fe97afc87ab54" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "import numpy as np\n", "from collections import Counter\n", "\n", "\n", "class MyDecisionTreeClassifier:\n", " def __init__(self, max_depth=None, min_samples_split=2, criterion='gini'):\n", " self.max_depth = max_depth\n", " self.min_samples_split = min_samples_split\n", " self.criterion = criterion # 'gini' or 'entropy'\n", " self.tree = None\n", "\n", " def _calculate_impurity(self, y):\n", " # 计算y中各类别比例\n", " class_counts = Counter(y)\n", " total_samples = len(y)\n", " impurity = 0\n", " if total_samples == 0:\n", " return 0\n", "\n", " if self.criterion == 'gini':\n", " for cls_count in class_counts.values():\n", " p_k = cls_count / total_samples\n", " impurity += p_k * (1 - p_k) # Gini = sum(pk * (1-pk)) or 1 - sum(pk^2)\n", " # return 1 - impurity # if using 1 - sum(pk^2)\n", " return impurity\n", " elif self.criterion == 'entropy':\n", " for cls_count in class_counts.values():\n", " p_k = cls_count / total_samples\n", " if p_k > 0: # log2(0) is undefined\n", " impurity -= p_k * np.log2(p_k)\n", " return impurity\n", " else:\n", " raise ValueError(\"Unknown criterion.\")\n", "\n", " def _calculate_information_gain(self, X_column, y, threshold):\n", " parent_impurity = self._calculate_impurity(y)\n", "\n", " # 根据阈值划分数据\n", " left_indices = X_column <= threshold\n", " right_indices = X_column > threshold\n", "\n", " y_left, y_right = y[left_indices], y[right_indices]\n", "\n", " if len(y_left) == 0 or len(y_right) == 0:\n", " return 0 # 如果划分导致一个子集为空,则增益为0\n", "\n", " n = len(y)\n", " n_left, n_right = len(y_left), len(y_right)\n", "\n", " impurity_left = self._calculate_impurity(y_left)\n", " impurity_right = self._calculate_impurity(y_right)\n", "\n", " child_impurity = (n_left / n) * impurity_left + (n_right / n) * impurity_right\n", " information_gain = parent_impurity - child_impurity\n", " return information_gain\n", "\n", " def _find_best_split(self, X, y):\n", " best_gain = -1\n", " best_feature_idx = None\n", " best_threshold = None\n", " n_features = X.shape[1]\n", "\n", " for feature_idx in range(n_features):\n", " X_column = X[:, feature_idx]\n", " # 对于数值特征,可能的阈值是排序后唯一值的中间点\n", " # 简化的做法: 尝试每个唯一值作为阈值 (或它们之间的中点)\n", " thresholds = np.unique(X_column)\n", " if len(thresholds) > 10: # 抽样一部分阈值避免计算量过大\n", " thresholds = np.percentile(X_column, np.arange(10, 100, 10))\n", "\n", " for threshold in thresholds:\n", " gain = self._calculate_information_gain(X_column, y, threshold)\n", " if gain > best_gain:\n", " best_gain = gain\n", " best_feature_idx = feature_idx\n", " best_threshold = threshold\n", " return best_feature_idx, best_threshold, best_gain\n", "\n", " def _build_tree(self, X, y, depth=0):\n", " n_samples, n_features = X.shape\n", " n_labels = len(np.unique(y))\n", "\n", " # 停止条件\n", " if (self.max_depth is not None and depth >= self.max_depth) or \\\n", " n_labels == 1 or \\\n", " n_samples < self.min_samples_split:\n", " leaf_value = Counter(y).most_common(1)[0][0] # 叶节点值为多数类\n", " return {'value': leaf_value} # 使用字典表示叶节点\n", "\n", " best_feature_idx, best_threshold, best_gain = self._find_best_split(X, y)\n", "\n", " # 如果信息增益很小,也停止分裂 (避免过拟合)\n", " if best_gain <= 0.001: # 可调参数\n", " leaf_value = Counter(y).most_common(1)[0][0]\n", " return {'value': leaf_value}\n", "\n", " # 划分数据集\n", " left_indices = X[:, best_feature_idx] <= best_threshold\n", " right_indices = X[:, best_feature_idx] > best_threshold\n", "\n", " X_left, y_left = X[left_indices], y[left_indices]\n", " X_right, y_right = X[right_indices], y[right_indices]\n", "\n", " # 确保子集非空,如果一个子集为空,则无法继续分裂,当前节点成为叶节点\n", " if len(y_left) == 0 or len(y_right) == 0:\n", " leaf_value = Counter(y).most_common(1)[0][0]\n", " return {'value': leaf_value}\n", "\n", " # 递归构建左右子树\n", " left_subtree = self._build_tree(X_left, y_left, depth + 1)\n", " right_subtree = self._build_tree(X_right, y_right, depth + 1)\n", "\n", " return {\n", " 'feature_index': best_feature_idx,\n", " 'threshold': best_threshold,\n", " 'left': left_subtree,\n", " 'right': right_subtree,\n", " 'info_gain': best_gain # 可选\n", " }\n", "\n", " def fit(self, X, y):\n", " self.tree = self._build_tree(X, y)\n", "\n", " def _traverse_tree(self, x, node):\n", " if 'value' in node: # 是叶节点\n", " return node['value']\n", "\n", " if x[node['feature_index']] <= node['threshold']:\n", " return self._traverse_tree(x, node['left'])\n", " else:\n", " return self._traverse_tree(x, node['right'])\n", "\n", " def predict(self, X):\n", " return np.array([self._traverse_tree(x, self.tree) for x in X])" ], "id": "81881edd3d6dac8c" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "# --- 训练自定义决策树模型 ---\n", "print(\"\\n--- 训练自定义决策树模型 ---\")\n", "tree_model = MyDecisionTreeClassifier(max_depth=5, min_samples_split=2, criterion='gini')\n", "tree_model.fit(X_train, y_train)\n", "y_pred_labels_tree = tree_model.predict(X_test)\n", "print(\"\\n--- 决策树模型评估 ---\")\n", "accuracy_tree = accuracy_score(y_test, y_pred_labels_tree)\n", "print(f\"Accuracy: {accuracy_tree:.4f}\")\n", "print(\"\\nClassification Report👀:\")\n", "print(classification_report(y_test, y_pred_labels_tree, target_names=['Will Not Buy (0)', 'Will Buy (1)']))\n", "print(\"\\n混淆矩阵:\")\n", "cm_tree = confusion_matrix(y_test, y_pred_labels_tree)\n", "sns.heatmap(cm_tree, annot=True, fmt='d', cmap='Blues', xticklabels=['Will Not Buy', 'Will Buy'],\n", " yticklabels=['Will Not Buy', 'Will Buy'])\n", "plt.xlabel('Predicted')\n", "plt.ylabel('Actual')\n", "plt.title('Confusion Matrix (Decision Tree)')\n", "plt.show()\n", "# ROC曲线和AUC\n", "fpr_tree, tpr_tree, thresholds_tree = roc_curve(y_test, y_pred_labels_tree)\n", "roc_auc_tree = auc(fpr_tree, tpr_tree)\n", "plt.figure()\n", "plt.plot(fpr_tree, tpr_tree, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_tree:.2f})')\n", "plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')\n", "plt.xlim([0.0, 1.0])\n", "plt.ylim([0.0, 1.05])\n", "plt.xlabel('False Positive Rate')\n", "plt.ylabel('True Positive Rate')\n", "plt.title('ROC Curve (Decision Tree)')\n", "plt.legend(loc=\"lower right\")\n", "plt.show()\n", "\n", "print(f\"AUC: {roc_auc_tree:.4f}\")" ], "id": "f82f5a58da4117e1" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "import numpy as np\n", "\n", "\n", "class MyGaussianNaiveBayes:\n", " def __init__(self):\n", " self.class_priors_ = None\n", " self.class_means_ = None\n", " self.class_vars_ = None # 或者 stds_\n", " self.classes_ = None\n", " self.epsilon = 1e-9 # 防止除以零或log(0)\n", "\n", " def fit(self, X, y):\n", " n_samples, n_features = X.shape\n", " self.classes_ = np.unique(y)\n", " n_classes = len(self.classes_)\n", "\n", " self.class_priors_ = np.zeros(n_classes)\n", " self.class_means_ = np.zeros((n_classes, n_features))\n", " self.class_vars_ = np.zeros((n_classes, n_features))\n", "\n", " for idx, c in enumerate(self.classes_):\n", " X_c = X[y == c] # 取出类别c的所有样本\n", " self.class_priors_[idx] = X_c.shape[0] / n_samples\n", " self.class_means_[idx, :] = X_c.mean(axis=0)\n", " self.class_vars_[idx, :] = X_c.var(axis=0) + self.epsilon # 添加epsilon防止方差为0\n", "\n", " def _pdf(self, class_idx, x_row): # x_row是单个样本的特征向量\n", " mean = self.class_means_[class_idx]\n", " var = self.class_vars_[class_idx]\n", " # log_pdf = -0.5 * np.sum(np.log(2. * np.pi * var)) - 0.5 * np.sum(((x_row - mean) ** 2) / var)\n", " # 直接计算概率,但要注意下溢风险,通常用log-sum-exp技巧\n", " numerator = np.exp(-((x_row - mean) ** 2) / (2 * var))\n", " denominator = np.sqrt(2 * np.pi * var)\n", " return numerator / denominator # 这会返回每个特征的P(xj|yk)\n", "\n", " def _calculate_log_class_probability(self, class_idx, x_row):\n", " log_prior = np.log(self.class_priors_[class_idx] + self.epsilon)\n", "\n", " mean = self.class_means_[class_idx]\n", " var = self.class_vars_[class_idx] # var = std^2\n", "\n", " # log( P(xj | yk) ) = -log(sqrt(2*pi*var_j)) - (xj - mean_j)^2 / (2*var_j)\n", " log_likelihood_terms = -0.5 * np.log(2 * np.pi * var) - 0.5 * ((x_row - mean) ** 2) / var\n", " log_likelihood = np.sum(log_likelihood_terms)\n", "\n", " return log_prior + log_likelihood\n", "\n", " def predict_proba(self, X): # 返回每个类别的对数后验概率(未归一化)或归一化概率\n", " n_samples = X.shape[0]\n", " n_classes = len(self.classes_)\n", " log_posteriors = np.zeros((n_samples, n_classes))\n", "\n", " for i in range(n_samples):\n", " for class_idx in range(n_classes):\n", " log_posteriors[i, class_idx] = self._calculate_log_class_probability(class_idx, X[i])\n", "\n", " # 归一化得到概率 (可选,如果只需要类别可以省略)\n", " # log_sum_exp 技巧避免下溢/上溢\n", " max_log = np.max(log_posteriors, axis=1, keepdims=True)\n", " log_posteriors_shifted = log_posteriors - max_log\n", " exp_log_posteriors_shifted = np.exp(log_posteriors_shifted)\n", " sum_exp = np.sum(exp_log_posteriors_shifted, axis=1, keepdims=True)\n", " probabilities = exp_log_posteriors_shifted / sum_exp\n", " return probabilities\n", "\n", " def predict(self, X):\n", " predictions = []\n", " for x_row in X:\n", " posteriors = []\n", " for class_idx, c in enumerate(self.classes_):\n", " log_posterior = self._calculate_log_class_probability(class_idx, x_row)\n", " posteriors.append(log_posterior)\n", " predictions.append(self.classes_[np.argmax(posteriors)])\n", " return np.array(predictions)" ], "id": "b12fa40a4de770e5" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "# --- 训练自定义朴素贝叶斯模型 ---\n", "print(\"\\n--- 训练自定义朴素贝叶斯模型 ---\")\n", "nb_model = MyGaussianNaiveBayes()\n", "nb_model.fit(X_train, y_train)\n", "y_pred_labels_nb = nb_model.predict(X_test)\n", "print(\"\\n--- 朴素贝叶斯模型评估 ---\")\n", "accuracy_nb = accuracy_score(y_test, y_pred_labels_nb)\n", "print(f\"Accuracy: {accuracy_nb:.4f}\")\n", "print(\"\\nClassification Report👀:\")\n", "print(classification_report(y_test, y_pred_labels_nb, target_names=['Will Not Buy (0)', 'Will Buy (1)']))\n", "print(\"\\n混淆矩阵:\")\n", "cm_nb = confusion_matrix(y_test, y_pred_labels_nb)\n", "sns.heatmap(cm_nb, annot=True, fmt='d', cmap='Blues', xticklabels=['Will Not Buy', 'Will Buy'],\n", " yticklabels=['Will Not Buy', 'Will Buy'])\n", "plt.xlabel('Predicted')\n", "plt.ylabel('Actual')\n", "plt.title('Confusion Matrix (Naive Bayes)')\n", "plt.show()\n", "# ROC曲线和AUC\n", "fpr_nb, tpr_nb, thresholds_nb = roc_curve(y_test, y_pred_labels_nb)\n", "roc_auc_nb = auc(fpr_nb, tpr_nb)\n", "plt.figure()\n", "plt.plot(fpr_nb, tpr_nb, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_nb:.2f})')\n", "plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')\n", "plt.xlim([0.0, 1.0])\n", "plt.ylim([0.0, 1.05])\n", "plt.xlabel('False Positive Rate')\n", "plt.ylabel('True Positive Rate')\n", "plt.title('ROC Curve (Naive Bayes)')\n", "plt.legend(loc=\"lower right\")\n", "plt.show()\n", "\n", "print(f\"AUC: {roc_auc_nb:.4f}\")" ], "id": "e8c167999fb13f6b" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "import numpy as np\n", "from collections import Counter\n", "\n", "\n", "class MyKNearestNeighbors:\n", " def __init__(self, k=3):\n", " self.k = k\n", " self.X_train = None\n", " self.y_train = None\n", "\n", " def fit(self, X_train, y_train):\n", " self.X_train = X_train\n", " self.y_train = y_train\n", "\n", " def _euclidean_distance(self, x1, x2):\n", " return np.sqrt(np.sum((x1 - x2) ** 2))\n", "\n", " def _predict_single(self, x_test_sample):\n", " distances = [self._euclidean_distance(x_test_sample, x_train_sample) for x_train_sample in self.X_train]\n", " # 获取k个最近邻的索引\n", " k_indices = np.argsort(distances)[:self.k]\n", " # 获取k个最近邻的标签\n", " k_nearest_labels = [self.y_train[i] for i in k_indices]\n", " # 多数投票\n", " most_common = Counter(k_nearest_labels).most_common(1)\n", " return most_common[0][0]\n", "\n", " def predict(self, X_test):\n", " predictions = [self._predict_single(x_test_sample) for x_test_sample in X_test]\n", " return np.array(predictions)" ], "id": "718bb29ac00a859c" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "# --- 训练自定义KNN模型 ---\n", "print(\"\\n--- 训练自定义KNN模型 ---\")\n", "knn_model = MyKNearestNeighbors(k=5) # k值可以调整\n", "knn_model.fit(X_train, y_train)\n", "y_pred_labels_knn = knn_model.predict(X_test)\n", "print(\"\\n--- KNN模型评估 ---\")\n", "accuracy_knn = accuracy_score(y_test, y_pred_labels_knn)\n", "print(f\"Accuracy: {accuracy_knn:.4f}\")\n", "print(\"\\nClassification Report👀:\")\n", "print(classification_report(y_test, y_pred_labels_knn, target_names=['Will Not Buy (0)', 'Will Buy (1)']))\n", "print(\"\\n混淆矩阵:\")\n", "cm_knn = confusion_matrix(y_test, y_pred_labels_knn)\n", "sns.heatmap(cm_knn, annot=True, fmt='d', cmap='Blues', xticklabels=['Will Not Buy', 'Will Buy'],\n", " yticklabels=['Will Not Buy', 'Will Buy'])\n", "plt.xlabel('Predicted')\n", "plt.ylabel('Actual')\n", "plt.title('Confusion Matrix (KNN)')\n", "plt.show()\n", "# ROC曲线和AUC\n", "fpr_knn, tpr_knn, thresholds_knn = roc_curve(y_test, y_pred_labels_knn)\n", "roc_auc_knn = auc(fpr_knn, tpr_knn)\n", "plt.figure()\n", "plt.plot(fpr_knn, tpr_knn, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_knn:.2f})')\n", "plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')\n", "plt.xlim([0.0, 1.0])\n", "plt.ylim([0.0, 1.05])\n", "plt.xlabel('False Positive Rate')\n", "plt.ylabel('True Positive Rate')\n", "plt.title('ROC Curve (KNN)')\n", "plt.legend(loc=\"lower right\")\n", "plt.show()\n", "\n", "print(f\"AUC: {roc_auc_knn:.4f}\")\n" ], "id": "fc3b39b5d46a42a" } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 5 }