{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "fb6a6862-9ab9-47c7-b5da-0bc772897129",
   "metadata": {},
   "source": [
    "# Training an ML model using CICIoT2023\n",
    "\n",
    "This notebook shows how a LogisticRegression model can be trained and evaluated using the CICIoT2023 CSV files.\n",
    "\n",
    "The CSV files are split 80/20 (by file) into training and test sets. The training files are standardised, concatenated in memory and each model is fitted **once** on the whole training set, so the scaled training set has to fit in RAM."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40f7c50d-b0ae-4f19-9398-1435ba7a851d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import warnings\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score\n",
    "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
    "from tqdm import tqdm\n",
    "\n",
    "# NOTE(review): this silences every warning, including scikit-learn's\n",
    "# ConvergenceWarning - consider narrowing the filter.\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c40b5d2-727b-4f37-a480-9d46304eb541",
   "metadata": {},
   "outputs": [],
   "source": [
    "DATASET_DIRECTORY = '../CICIoT2023/'\n",
    "TRAIN_FRACTION = .8  # share of the (sorted) CSV files used for training"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3ec1f2b2-92b3-4622-895b-6ac5126f30b4",
   "metadata": {},
   "source": [
    "## Importing Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "read-dataset-helper",
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_dataset(file_name):\n",
    "    \"\"\"Load one CSV file of the dataset, given its name inside DATASET_DIRECTORY.\"\"\"\n",
    "    return pd.read_csv(os.path.join(DATASET_DIRECTORY, file_name))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6854f877-5524-46ba-b7ca-5d6040015f44",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sorted so that the train/test split is the same on every run.\n",
    "df_sets = sorted(k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv'))\n",
    "n_training = int(len(df_sets) * TRAIN_FRACTION)\n",
    "training_sets = df_sets[:n_training]\n",
    "test_sets = df_sets[n_training:]\n",
    "\n",
    "assert training_sets and test_sets, (\n",
    "    f'need at least two CSV files in {DATASET_DIRECTORY!r} to build a train/test split'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0433838d-ca57-4dd8-b41c-ad2ee3df61c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): 'Magnitue' is presumably the spelling used in the CSV header -\n",
    "# confirm against the files before correcting it.\n",
    "X_columns = [\n",
    "    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',\n",
    "    'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',\n",
    "    'rst_flag_number', 'psh_flag_number', 'ack_flag_number',\n",
    "    'ece_flag_number', 'cwr_flag_number', 'ack_count',\n",
    "    'syn_count', 'fin_count', 'urg_count', 'rst_count',\n",
    "    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',\n",
    "    'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',\n",
    "    'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',\n",
    "    'Radius', 'Covariance', 'Variance', 'Weight',\n",
    "]\n",
    "y_column = 'label'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "249673a6-4826-4b80-b9aa-dfa4c3d549c4",
   "metadata": {},
   "source": [
    "## Scaling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cba40f31",
   "metadata": {},
   "outputs": [],
   "source": [
    "# partial_fit accumulates the mean/variance over all training files; a plain\n",
    "# fit() restarts on every call and would keep only the last file's statistics.\n",
    "# The scaler is created here so that re-running this cell starts from scratch.\n",
    "scaler = StandardScaler()\n",
    "for train_set in tqdm(training_sets):\n",
    "    scaler.partial_fit(read_dataset(train_set)[X_columns])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "helpers-heading",
   "metadata": {},
   "source": [
    "## Training and evaluation helpers\n",
    "\n",
    "The three experiments below (34, 8 and 2 classes) share the same pipeline and differ only in how the raw labels are grouped."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "train-eval-helpers",
   "metadata": {},
   "outputs": [],
   "source": [
    "ML_names = [\n",
    "    'LogisticRegression',\n",
    "]\n",
    "\n",
    "\n",
    "def build_models():\n",
    "    \"\"\"Return fresh, unfitted estimators, in the same order as ML_names.\"\"\"\n",
    "    return [\n",
    "        LogisticRegression(n_jobs=-1),\n",
    "    ]\n",
    "\n",
    "\n",
    "def relabel(labels, label_map=None):\n",
    "    \"\"\"Return the labels as an array, optionally mapped to coarser classes.\n",
    "\n",
    "    labels    -- pandas Series holding the raw labels\n",
    "    label_map -- dict {raw label: class name}, or None to keep the raw labels\n",
    "\n",
    "    Raises KeyError when a label has no entry in label_map.\n",
    "    \"\"\"\n",
    "    if label_map is None:\n",
    "        return labels.to_numpy()\n",
    "    mapped = labels.map(label_map)\n",
    "    unmapped = mapped.isna()\n",
    "    if unmapped.any():\n",
    "        raise KeyError(f'labels missing from the mapping: {labels[unmapped].unique().tolist()}')\n",
    "    return mapped.to_numpy()\n",
    "\n",
    "\n",
    "def iter_scaled(file_names, label_map=None, desc=None):\n",
    "    \"\"\"Yield one (X, y) pair per CSV file.\n",
    "\n",
    "    X holds the X_columns features standardised with the fitted `scaler`, cast\n",
    "    to float32 to halve the memory footprint; y holds the matching labels.\n",
    "    \"\"\"\n",
    "    for file_name in tqdm(file_names, desc=desc):\n",
    "        frame = read_dataset(file_name)\n",
    "        X = scaler.transform(frame[X_columns]).astype(np.float32)\n",
    "        yield X, relabel(frame[y_column], label_map)\n",
    "\n",
    "\n",
    "def train_models(models, file_names, label_map=None):\n",
    "    \"\"\"Fit every model once on all the training files and return the models.\n",
    "\n",
    "    fit() starts from scratch on every call, so fitting file by file would keep\n",
    "    only what was learned from the last file. The files are therefore\n",
    "    concatenated first; the scaled training set must fit in memory.\n",
    "    \"\"\"\n",
    "    parts = list(iter_scaled(file_names, label_map, desc='loading training files'))\n",
    "    X_train = np.concatenate([X for X, _ in parts])\n",
    "    y_train = np.concatenate([y for _, y in parts])\n",
    "    del parts  # release the per-file arrays before fitting\n",
    "    for model in models:\n",
    "        model.fit(X_train, y_train)\n",
    "    return models\n",
    "\n",
    "\n",
    "def predict_models(models, file_names, label_map=None):\n",
    "    \"\"\"Predict the test files one at a time.\n",
    "\n",
    "    Returns (y_true, predictions): predictions[i] lists the predictions of\n",
    "    models[i], aligned with y_true.\n",
    "    \"\"\"\n",
    "    y_true = []\n",
    "    predictions = {i: [] for i in range(len(models))}\n",
    "    for X, y in iter_scaled(file_names, label_map, desc='predicting test files'):\n",
    "        y_true.extend(y)\n",
    "        for i, model in enumerate(models):\n",
    "            predictions[i].extend(model.predict(X))\n",
    "    return y_true, predictions\n",
    "\n",
    "\n",
    "def report_scores(y_true, predictions, names, n_classes, sep=': '):\n",
    "    \"\"\"Print accuracy and macro-averaged recall, precision and F1 of every model.\n",
    "\n",
    "    scikit-learn metrics take (y_true, y_pred) in that order; swapping them\n",
    "    exchanges the macro precision and the macro recall.\n",
    "    \"\"\"\n",
    "    for k, y_pred in predictions.items():\n",
    "        print(f'##### {names[k]} ({n_classes} classes) #####')\n",
    "        print(f'accuracy_score{sep}', accuracy_score(y_true, y_pred))\n",
    "        print(f'recall_score{sep}', recall_score(y_true, y_pred, average='macro'))\n",
    "        print(f'precision_score{sep}', precision_score(y_true, y_pred, average='macro'))\n",
    "        print(f'f1_score{sep}', f1_score(y_true, y_pred, average='macro'))\n",
    "        print()\n",
    "        print()\n",
    "        print()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "60abc3f0-e32d-40be-abc5-fd5972cf9856",
   "metadata": {},
   "source": [
    "## Classification: 34 (33+1) classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d208cf46-8ba9-480f-ab99-d4ee81c083b4",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "ML_models = train_models(build_models(), training_sets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6116132e-02f0-4bac-aefb-2ba0bee924ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_test, preds = predict_models(ML_models, test_sets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "375dcbfb-2b20-4b37-8fbb-c9d68a6ac541",
   "metadata": {},
   "outputs": [],
   "source": [
    "report_scores(y_test, preds, ML_names, n_classes=34)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3958c6fa-6d05-48fb-a046-55e5843e4711",
   "metadata": {},
   "source": [
    "## Classification: 8 (7+1) classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9208c899-8b57-4a3a-a2e7-94b057123536",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw label -> coarse class: 7 attack families plus 'Benign'.\n",
    "dict_7classes = {\n",
    "    'DDoS-RSTFINFlood': 'DDoS',\n",
    "    'DDoS-PSHACK_Flood': 'DDoS',\n",
    "    'DDoS-SYN_Flood': 'DDoS',\n",
    "    'DDoS-UDP_Flood': 'DDoS',\n",
    "    'DDoS-TCP_Flood': 'DDoS',\n",
    "    'DDoS-ICMP_Flood': 'DDoS',\n",
    "    'DDoS-SynonymousIP_Flood': 'DDoS',\n",
    "    'DDoS-ACK_Fragmentation': 'DDoS',\n",
    "    'DDoS-UDP_Fragmentation': 'DDoS',\n",
    "    'DDoS-ICMP_Fragmentation': 'DDoS',\n",
    "    'DDoS-SlowLoris': 'DDoS',\n",
    "    'DDoS-HTTP_Flood': 'DDoS',\n",
    "\n",
    "    'DoS-UDP_Flood': 'DoS',\n",
    "    'DoS-SYN_Flood': 'DoS',\n",
    "    'DoS-TCP_Flood': 'DoS',\n",
    "    'DoS-HTTP_Flood': 'DoS',\n",
    "\n",
    "    'Mirai-greeth_flood': 'Mirai',\n",
    "    'Mirai-greip_flood': 'Mirai',\n",
    "    'Mirai-udpplain': 'Mirai',\n",
    "\n",
    "    'Recon-PingSweep': 'Recon',\n",
    "    'Recon-OSScan': 'Recon',\n",
    "    'Recon-PortScan': 'Recon',\n",
    "    'VulnerabilityScan': 'Recon',\n",
    "    'Recon-HostDiscovery': 'Recon',\n",
    "\n",
    "    'DNS_Spoofing': 'Spoofing',\n",
    "    'MITM-ArpSpoofing': 'Spoofing',\n",
    "\n",
    "    'BenignTraffic': 'Benign',\n",
    "\n",
    "    'BrowserHijacking': 'Web',\n",
    "    'Backdoor_Malware': 'Web',\n",
    "    'XSS': 'Web',\n",
    "    'Uploading_Attack': 'Web',\n",
    "    'SqlInjection': 'Web',\n",
    "    'CommandInjection': 'Web',\n",
    "\n",
    "    'DictionaryBruteForce': 'BruteForce',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c1f697f-88d8-4ac4-8bc6-f1a8ac3794d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "ML_models = train_models(build_models(), training_sets, label_map=dict_7classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b69c509-7666-45bd-9e11-52ecec0df8a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_test, preds = predict_models(ML_models, test_sets, label_map=dict_7classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e0a9702-63f5-4898-a8b0-2bf950fe881d",
   "metadata": {},
   "outputs": [],
   "source": [
    "report_scores(y_test, preds, ML_names, n_classes=8, sep=' = ')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a6ecac59-fc02-4198-9910-daf890da7a0a",
   "metadata": {},
   "source": [
    "## Classification: 2 (1+1) classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90ee4a99-d160-43bc-b2a0-06fa3f49e222",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same raw labels as dict_7classes: everything that is not benign is an attack.\n",
    "dict_2classes = {\n",
    "    label: 'Benign' if group == 'Benign' else 'Attack'\n",
    "    for label, group in dict_7classes.items()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "506eae35-a310-4a34-8bcf-c99282ed3225",
   "metadata": {},
   "outputs": [],
   "source": [
    "ML_models = train_models(build_models(), training_sets, label_map=dict_2classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b07aa379-ec7e-4651-ab5a-6845ae249132",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_test, preds = predict_models(ML_models, test_sets, label_map=dict_2classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "caabf4fd-097d-4db2-847a-0dcd87144d6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "report_scores(y_test, preds, ML_names, n_classes=2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}