Stream-AD · MKasaei00 · Jul 26, 2024 · Jul 26, 2024 · Jul 26, 2024 · Jul 26, 2024
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,14 @@
+# cpp object files
+*.o
+
+# results files
+results/*.csv
+
+# preprocessed data labels
+data/*/Label_*_*.csv
+
+# main executable file
+main
+
+# idea
+.idea
diff --git a/.idea/.gitignore b/.idea/.gitignore
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/README.md b/README.md
@@ -23,8 +23,15 @@ AnoGraph and AnoGraph-K detect graph anomalies by first mapping the graph to a h
 
 ## Demo
 
-1. To run on DARPA dataset `bash demo.sh DARPA`
-2. To run on ISCX dataset `bash demo.sh ISCX`
+1. To run on DARPA dataset 
+```
+bash demo.sh DARPA
+```
+
+2. To run on ISCX dataset 
+```
+bash demo.sh ISCX
+```
 
 ## Datasets
 1. [DARPA](http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html)

diff --git a/code/README.MD b/code/README.MD
@@ -0,0 +1,26 @@
+# توضیح بخش‌های مختلف کد
+
+## process_data.py
+
+در این فایل پردازش داده‌ها به صورت زیر انجام می‌شود.
+
+داده‌های زمانی براساس پارامتر مشخصی گسسته‌سازی می‌شوند و به ازای هر 
+time_stamp
+زمان یک برچسب مشخص می‌شود.
+
+روش تصمیم‌گیری برای برچسب به این‌صورت است که اگر بیش از تعداد مشخصی از یال‌های آن دسته‌ی زمانی برچسب مثبت داشته باشند آن لحظه‌ی زمانی به صورت کامل مثبت اعلام می‌شود.
+
+```python
+def generate_final_labels(edge_threshold, record_labels, time_param):
+    data = pd.DataFrame(np.array(record_labels))
+    labels = []
+    data[2] = (data[2] / time_param).astype(int)
+
+    for i in pd.unique(data[2]):
+        labels.append(sum(data[data[2] == i][3]))
+
+    labels = np.array(labels)
+    labels = labels >= edge_threshold
+    labels = labels * 1
+    return labels
+```
diff --git a/code/anoedgeglobal.cpp b/code/anoedgeglobal.cpp
@@ -23,10 +23,10 @@ vector<double> AnoedgeGlobal::getScores() {
     vector<double> scores;
     Hcms count(rows, buckets);
 
-    int num_records = src.size();
+    size_t num_records = src.size();
     int last_time = 0;
 
-    for (int i = 0; i < num_records; i++) {
+    for (size_t i = 0; i < num_records; i++) {
         if (times[i] - last_time > 0) {
             count.decay(decay_factor);
         }
@@ -51,20 +51,20 @@ void AnoedgeGlobal::run() {
 }
 
 double AnoedgeGlobal::getAnoedgeglobalDensity(vector<vector<double>>& mat, int src, int dst) {
-	int num_rows = mat.size();
-	int num_cols = mat[0].size();
+	size_t num_rows = mat.size();
+    size_t num_cols = mat[0].size();
 
 	bool row_flag[num_rows];
 	bool col_flag[num_cols];
 
 	double row_slice_sum[num_rows];
 	double col_slice_sum[num_cols];
 
-	for (int i = 0; i < num_rows; i++) {
+	for (size_t i = 0; i < num_rows; i++) {
 		row_flag[i] = false;
 		row_slice_sum[i] = mat[i][dst];
 	}
-	for (int i = 0; i < num_cols; i++) {
+	for (size_t i = 0; i < num_cols; i++) {
 		col_flag[i] = false;
 		col_slice_sum[i] = mat[src][i];
 	}
@@ -75,14 +75,14 @@ double AnoedgeGlobal::getAnoedgeglobalDensity(vector<vector<double>>& mat, int s
 	col_slice_sum[dst] = mat[src][dst];
 
 	pair<int, double> max_row = {-1, -1.0};
-	for (int i = 0; i < num_rows; i++) {
+	for (size_t i = 0; i < num_rows; i++) {
 		if (!row_flag[i] && (row_slice_sum[i] >= max_row.second)) {
 			max_row = {i, row_slice_sum[i]};
 		}
 	}
 
 	pair<int, double> max_col = {-1, -1.0};
-	for (int i = 0; i < num_cols; i++) {
+	for (size_t i = 0; i < num_cols; i++) {
 		if (!col_flag[i] && (col_slice_sum[i] >= max_col.second)) {
 			max_col = {i, col_slice_sum[i]};
 		}
@@ -94,14 +94,14 @@ double AnoedgeGlobal::getAnoedgeglobalDensity(vector<vector<double>>& mat, int s
 	double cur_mat_sum = mat[src][dst];
 	double output = cur_mat_sum/sqrt(marked_rows*marked_cols);
 
-	int ctr = num_rows + num_cols - 2;
+	size_t ctr = num_rows + num_cols - 2;
 	while (ctr--) {
 		if (max_row.second >= max_col.second) {
 			row_flag[max_row.first] = true;
 			marked_rows++;
 
 			max_col = {-1, -1.0};
-			for (int i = 0; i < num_cols; i++) {
+			for (size_t i = 0; i < num_cols; i++) {
 				if (col_flag[i]) {
 					cur_mat_sum = cur_mat_sum + mat[max_row.first][i];
 				} else {
@@ -113,7 +113,7 @@ double AnoedgeGlobal::getAnoedgeglobalDensity(vector<vector<double>>& mat, int s
 			}
 
 			max_row = {-1, -1.0};
-			for (int i = 0; i < num_rows; i++) {
+			for (size_t i = 0; i < num_rows; i++) {
 				if (!row_flag[i] && (row_slice_sum[i] >= max_row.second)) {
 					max_row = {i, row_slice_sum[i]};
 				}
@@ -123,7 +123,7 @@ double AnoedgeGlobal::getAnoedgeglobalDensity(vector<vector<double>>& mat, int s
 			marked_cols++;
 
 			max_row = {-1, -1.0};
-			for (int i = 0; i < num_rows; i++) {
+			for (size_t i = 0; i < num_rows; i++) {
 				if (row_flag[i]) {
 					cur_mat_sum = cur_mat_sum + mat[i][max_col.first];
 				} else {
@@ -135,7 +135,7 @@ double AnoedgeGlobal::getAnoedgeglobalDensity(vector<vector<double>>& mat, int s
 			}
 
 			max_col = {-1, -1.0};
-			for (int i = 0; i < num_cols; i++) {
+			for (size_t i = 0; i < num_cols; i++) {
 				if (!col_flag[i] && (col_slice_sum[i] >= max_col.second)) {
 					max_col = {i, col_slice_sum[i]};
 				}

diff --git a/code/anograph.cpp b/code/anograph.cpp
@@ -1,5 +1,6 @@
 #include <iostream>
 #include <cmath>
+#include <algorithm>
 
 #include "anograph.hpp"
 #include "hcms.hpp"

diff --git a/code/anographk.cpp b/code/anographk.cpp
@@ -1,6 +1,7 @@
 #include <iostream>
 #include <cmath>
 #include <queue>
+#include <algorithm>
 
 #include "anographk.hpp"
 #include "hcms.hpp"

diff --git a/code/demo.sh b/code/demo.sh
@@ -13,11 +13,25 @@ if [ $1 == "DARPA" ]; then
   ./main anograph_k DARPA 30 50 2 32 5
 
   echo "Running AnoEdge-G"
+  # Algorithm => anoedge_g
+  # Dataset => DARPA
+  # Rows => 2
+  # Buckets => 32
+  # Decay factor => 0.9
   ./main anoedge_g DARPA 2 32 0.9
 
   echo "Running AnoEdge-L"
+  # Algorithm => anoedge_l
+  # Dataset => DARPA
+  # Rows => 2
+  # Buckets => 32
+  # Decay factor => 0.9
   ./main anoedge_l DARPA 2 32 0.9
 
+  echo "Installing python dependencies"
+  pip3 install -r requirements.txt
+
+  echo "Running python metrics"
   python3 metrics.py --dataset DARPA --time_window 30 --edge_threshold 50
 
 fi

diff --git a/code/hcms.cpp b/code/hcms.cpp
@@ -1,4 +1,5 @@
 #include <iostream>
+#include <algorithm>
 
 #include "hcms.hpp"
 #include "anoedgeglobal.hpp"

diff --git a/code/metrics.py b/code/metrics.py
@@ -1,61 +1,48 @@
 import pandas as pd
 import argparse
 from sklearn import metrics
+
 parser = argparse.ArgumentParser()
 parser.add_argument('--dataset', default='DARPA')
 parser.add_argument('--time_window', type=int, default=30)
 parser.add_argument("--edge_threshold", type=int, default=50)
 args = parser.parse_args()
 
+results_path = "../results/"
 
-def print_anoedge_auc_time(base_path, dataset_name, algorithm):
-	data = pd.read_csv(base_path + algorithm + "_" + dataset_name + "_score.csv", names=['score', 'label'], sep=" ")
-	time_values = pd.read_csv(base_path + algorithm + "_" + dataset_name + "_time.csv", names=['avg', 'total'], sep=" ")
 
-	fpr, tpr, _ = metrics.roc_curve(data.label, data.score)
-	auc = metrics.roc_auc_score(data.label, data.score)
+def print_anoedge_auc_time(base_path, dataset_name, algorithm):
+    file_name = base_path + algorithm + "_" + dataset_name
+    print_auc_time(algorithm, dataset_name, file_name)
 
-	print ("%s,%s" % (algorithm, dataset_name))
-	print ("AUC: %.3f" % (auc))
-	print ("Time: %s\n" % (time_values["total"].iloc[1]))
 
 def print_anograph_auc_time(base_path, dataset_name, time_window, edge_threshold, algorithm):
-	data = pd.read_csv(base_path + algorithm + "_" + dataset_name + "_" + str(time_window) + "_" + str(edge_threshold) + "_score.csv", names=['score', 'label'], sep=" ")
-	time_values = pd.read_csv(base_path + algorithm + "_" + dataset_name + "_" + str(time_window) + "_" + str(edge_threshold) + "_time.csv", names=['avg', 'total'], sep=" ")
-
-	fpr, tpr, _ = metrics.roc_curve(data.label, data.score)
-	auc = metrics.roc_auc_score(data.label, data.score)
+    file_name = base_path + algorithm + "_" + dataset_name + "_" + str(time_window) + "_" + str(edge_threshold)
+    print_auc_time(algorithm, dataset_name, file_name)
 
-	print ("%s,%s" % (algorithm, dataset_name))
-	print ("AUC: %.3f" % (auc))
-	print ("Time: %s\n" % (time_values["total"].iloc[1]))
 
+def print_auc_time(algorithm, dataset_name, file_name):
+    data = pd.read_csv(file_name + "_score.csv", names=['score', 'label'], sep=" ")
+    time_values = pd.read_csv(file_name + "_time.csv", names=['avg', 'total'], sep=" ")
 
-if __name__ == "__main__":
-	if args.dataset == 'DARPA':
-		print_anograph_auc_time("../results/", "DARPA", args.time_window, args.edge_threshold, "anograph")
-		print_anograph_auc_time("../results/", "DARPA", args.time_window, args.edge_threshold, "anograph_k")
-
-		print_anoedge_auc_time("../results/", "DARPA", "anoedge_g")
-		print_anoedge_auc_time("../results/", "DARPA", "anoedge_l")
+    fpr, tpr, _ = metrics.roc_curve(data.label, data.score)
+    auc = metrics.roc_auc_score(data.label, data.score)
 
-	if args.dataset == 'ISCX':
-		print_anograph_auc_time("../results/", "ISCX", args.time_window, args.edge_threshold, "anograph")
-		print_anograph_auc_time("../results/", "ISCX", args.time_window, args.edge_threshold, "anograph_k")
+    print("%s,%s" % (algorithm, dataset_name))
+    print("AUC: %.3f" % auc)
+    print("Time: %s\n" % (time_values["total"].iloc[1]))
 
-		print_anoedge_auc_time("../results/", "ISCX", "anoedge_g")
-		print_anoedge_auc_time("../results/", "ISCX", "anoedge_l")
 
-	if args.dataset == 'IDS2018':
-		print_anograph_auc_time("../results/", "IDS2018", args.time_window, args.edge_threshold, "anograph")
-		print_anograph_auc_time("../results/", "IDS2018", args.time_window, args.edge_threshold, "anograph_k")
+def run_with_dataset(dataset_name):
+    print_anograph_auc_time(results_path, dataset_name, args.time_window, args.edge_threshold, "anograph")
+    print_anograph_auc_time(results_path, dataset_name, args.time_window, args.edge_threshold, "anograph_k")
+    print_anoedge_auc_time(results_path, dataset_name, "anoedge_g")
+    print_anoedge_auc_time(results_path, dataset_name, "anoedge_l")
 
-		print_anoedge_auc_time("../results/", "IDS2018", "anoedge_g")
-		print_anoedge_auc_time("../results/", "IDS2018", "anoedge_l")
 
-	if args.dataset == 'DDOS2019':
-		print_anograph_auc_time("../results/", "DDOS2019", args.time_window, args.edge_threshold, "anograph")
-		print_anograph_auc_time("../results/", "DDOS2019", args.time_window, args.edge_threshold, "anograph_k")
-
-		print_anoedge_auc_time("../results/", "DDOS2019", "anoedge_g")
-		print_anoedge_auc_time("../results/", "DDOS2019", "anoedge_l")
+if __name__ == "__main__":
+    datasets = ["DARPA", "ISCX", "IDS2018", "DDOS2019"]
+    if args.dataset in datasets:
+        run_with_dataset(args.dataset)
+    else:
+        print(f"Could not detect dataset {args.dataset}")