- 日报
[烧你脑子]SVM模型,懂得都懂(代替360QVM,打破自制杀软旧维度)
- 2024-1-13 21:32:19 @
现在的自制杀软最高的也只是启发式扫描
代码(烧了两个星期的脑子):
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <sstream>
#include <cmath>
#include <cstdlib>
using namespace std;
// Hyperparameters for the SVM / SMO trainer.
const int MAX_FEATURES = 1000;  // declared feature cap — NOTE(review): never referenced anywhere in this file
const double C = 1.0;           // SMO box constraint: every alpha stays in [0, C]
const double TOLERANCE = 0.001; // KKT-violation tolerance; also the minimum useful alpha step and support-vector cutoff
// One training sample: a feature vector plus its class label.
struct Data {
vector<double> features; // numeric feature values parsed from one CSV row
int label;               // +1 = malicious, -1 = benign (the convention stated later in the post)
};
// A trained SVM: the bias term plus the support vectors and their multipliers.
struct SVMModel {
double b;                     // bias (intercept) of the decision function
vector<double> alphas;        // Lagrange multipliers, one per support vector
vector<Data> support_vectors; // training samples whose alpha ended up above TOLERANCE
};
vector<Data> load_data(string filename) {
vector<Data> data;
ifstream file(filename);
string line;
while (getline(file, line)) {
bool has_duplicate = false;
Data d;
stringstream ss(line);
string feature;
while (getline(ss, feature, ',')) {
d.features.push_back(stod(feature));
}
// Check if the features are duplicates
for (int i = 0; i < data.size(); i++) {
bool is_duplicate = true;
for (int j = 0; j < d.features.size(); j++) {
if (data[i].features[j] != d.features[j]) {
is_duplicate = false;
break;
}
}
if (is_duplicate) {
has_duplicate = true;
break;
}
}
if (has_duplicate) {
continue; // Skip this data point
}
istringstream label_stream(feature);
int label;
label_stream >> label;
d.label = label;
data.push_back(d);
}
return data;
}
// Linear kernel: the dot product of two feature vectors.
// Both vectors are assumed to have the same length (the CSV loader
// produces fixed-width rows).
double kernel(vector<double>& x1, vector<double>& x2) {
    double sum = 0.0;
    size_t idx = 0;
    while (idx < x1.size()) {
        sum += x1[idx] * x2[idx];
        ++idx;
    }
    return sum;
}
// Trains a linear SVM with the simplified SMO (Sequential Minimal
// Optimization) scheme: repeatedly pick a KKT-violating sample i, pair
// it with a random j, and jointly optimize the two Lagrange multipliers.
// Returns the bias `b` plus every sample whose alpha exceeds TOLERANCE
// (the support vectors).
//
// Fixes vs. the original:
//  - n == 1 no longer hangs: `while (j == i) j = rand() % n;` could never
//    produce a second index, so a single KKT-violating sample spun forever;
//  - a sweep cap bounds total work, since simplified SMO is not guaranteed
//    to converge on noisy/inseparable data.
// The pairwise update itself is kept statement-for-statement.
SVMModel train_svm(vector<Data>& data) {
    int n = (int)data.size();
    vector<double> alphas(n, 0.0);
    double b = 0.0;
    if (n >= 2) {
        const int MAX_SWEEPS = 1000; // safety cap on full passes over the data
        int sweeps = 0;
        bool is_converged = false;
        while (!is_converged && sweeps++ < MAX_SWEEPS) {
            int num_changed_alphas = 0;
            for (int i = 0; i < n; i++) {
                // E_i = f(x_i) - y_i : current prediction error for sample i.
                double Ei = 0.0;
                for (int j = 0; j < n; j++) {
                    Ei += alphas[j] * data[j].label * kernel(data[i].features, data[j].features);
                }
                Ei = Ei - (double)data[i].label;
                // Only optimize when sample i violates the KKT conditions.
                if ((data[i].label * Ei < -TOLERANCE && alphas[i] < C) ||
                    (data[i].label * Ei > TOLERANCE && alphas[i] > 0)) {
                    // Pick a random partner index j != i (n >= 2 guarantees one exists).
                    int j = i;
                    while (j == i) {
                        j = rand() % n;
                    }
                    double Ej = 0.0;
                    for (int k = 0; k < n; k++) {
                        Ej += alphas[k] * data[k].label * kernel(data[j].features, data[k].features);
                    }
                    Ej = Ej - (double)data[j].label;
                    double alpha_i_old = alphas[i];
                    double alpha_j_old = alphas[j];
                    // L/H clip bounds keep both alphas inside the box [0, C]
                    // while preserving the linear equality constraint.
                    double L, H;
                    if (data[i].label != data[j].label) {
                        L = max(0.0, alpha_j_old - alpha_i_old);
                        H = min(C, C + alpha_j_old - alpha_i_old);
                    }
                    else {
                        L = max(0.0, alpha_i_old + alpha_j_old - C);
                        H = min(C, alpha_i_old + alpha_j_old);
                    }
                    if (L == H) {
                        continue; // no room to move this pair
                    }
                    // eta = 2*K(i,j) - K(i,i) - K(j,j); must be negative for a valid step.
                    double eta = 2.0 * kernel(data[i].features, data[j].features) -
                        kernel(data[i].features, data[i].features) -
                        kernel(data[j].features, data[j].features);
                    if (eta >= 0.0) {
                        continue;
                    }
                    // Unconstrained optimum for alpha_j, then clip into [L, H].
                    alphas[j] = alpha_j_old - data[j].label * (Ei - Ej) / eta;
                    if (alphas[j] > H) {
                        alphas[j] = H;
                    }
                    else if (alphas[j] < L) {
                        alphas[j] = L;
                    }
                    // Ignore negligible steps.
                    if (abs(alphas[j] - alpha_j_old) < TOLERANCE) {
                        continue;
                    }
                    // Move alpha_i by the opposite amount, weighted by the label product.
                    alphas[i] = alpha_i_old + data[i].label * data[j].label *
                        (alpha_j_old - alphas[j]);
                    // Recompute the bias from whichever alpha stayed strictly inside (0, C).
                    double b1 = b - Ei - data[i].label * (alphas[i] - alpha_i_old) *
                        kernel(data[i].features, data[i].features) -
                        data[j].label * (alphas[j] - alpha_j_old) *
                        kernel(data[i].features, data[j].features);
                    double b2 = b - Ej - data[i].label * (alphas[i] - alpha_i_old) *
                        kernel(data[i].features, data[j].features) -
                        data[j].label * (alphas[j] - alpha_j_old) *
                        kernel(data[j].features, data[j].features);
                    if (0.0 < alphas[i] && alphas[i] < C) {
                        b = b1;
                    }
                    else if (0.0 < alphas[j] && alphas[j] < C) {
                        b = b2;
                    }
                    else {
                        b = (b1 + b2) / 2.0;
                    }
                    num_changed_alphas++;
                }
            }
            if (num_changed_alphas == 0) {
                is_converged = true; // full sweep with no updates => done
            }
        }
    }
    SVMModel model;
    model.b = b;
    // Keep only samples with a meaningfully non-zero alpha: the support vectors.
    for (int i = 0; i < n; i++) {
        if (alphas[i] > TOLERANCE) {
            model.alphas.push_back(alphas[i]);
            model.support_vectors.push_back(data[i]);
        }
    }
    return model;
}
// Classifies `features` with the trained model.
// Decision value: sum_i alpha_i * y_i * K(sv_i, x) + b.
// Returns +1 when the decision value is positive, -1 otherwise.
int predict(vector<double>& features, SVMModel& model) {
    double decision = model.b;
    for (size_t sv = 0; sv < model.alphas.size(); ++sv) {
        decision += model.alphas[sv] * model.support_vectors[sv].label *
            kernel(model.support_vectors[sv].features, features);
    }
    return (decision > 0.0) ? 1 : -1;
}
// Returns true if any line of `filename` contains `str` as a substring.
// A file that cannot be opened simply yields false — the original called
// exit(1) here, silently terminating the entire process from inside a
// query helper. The ifstream closes itself via RAII.
bool file_contains_string(const string& filename, const string& str) {
    ifstream file(filename);
    if (!file.is_open()) {
        return false;
    }
    string line;
    while (getline(file, line)) {
        if (line.find(str) != string::npos) {
            return true;
        }
    }
    return false;
}
int main(int argc, char* argv[]) {
string input_filename;
string output_filename;
if (argc >= 2) {
input_filename = argv[1];
output_filename = argv[2];
}
else {
cin >> input_filename;
cin >> output_filename;
}
vector<Data> data = load_data(input_filename);
SVMModel model = train_svm(data);
ofstream outfile(output_filename);
if (!outfile.is_open()) {
return 1;
}
for (int i = 0; i < data.size(); i++) {
int prediction = predict(data[i].features, model);
outfile << prediction << endl;
}
outfile.close();
if (file_contains_string(output_filename, "Prediction: 1")) {
cout << "Find The Virus " << endl;
}
else {
}
return 0;
}
运行环境 Visual Studio 2022 读取的文件 exe要扫描的文件,data.csv
先不谈data.csv,先谈公式(不想用神经网络,只能用在主动防御,再说也是静态扫描,要是在静态里实现动态扫描要把程序放到虚拟机里运行,还要用libvirt,又要会xml写虚拟机配置文件太麻烦了)
一、公式
1.内积的计算:
两个向量的内积(点积)的计算公式为: $\langle x_1, x_2 \rangle = \sum_{i=1}^{n} x_{1i} \, x_{2i}$
在代码中,通过循环遍历两个向量的对应元素,将其相乘并求和,得到内积结果。
2.核函数: 本代码使用线性核,即直接以两个特征向量的内积作为核函数值,不做任何非线性映射。
3.序列最小优化(SMO)算法: SMO算法是一种用于训练SVM模型的算法,它通过迭代优化拉格朗日乘子来最小化目标函数,实现模型参数的求解。在代码实现中,通过实现SMO算法来训练SVM模型,并计算出SVM模型中的偏置项、拉格朗日乘子和支持向量等参数。
二 、程序运算过程
->1.文件中加载数据集,并将其存储在一个vector<Data>
中.
->2.计算两个向量之间的内积,用于SVM的核函数.
->3.训练SVM模型,使用序列最小优化(SMO)算法来优化模型参数。
->4.对给定的特征向量进行预测,返回预测结果。
三、数据
data.csv怎么写
First先写代码(VS 2022),直接copy
#include <iostream>
#include <fstream>
#include <vector>
using namespace std;
// Reads the first `max_features` bytes of `filename` (opened in binary
// mode) and returns each byte as a double feature value.
// The cap was a hard-coded 3 in the original; it is now a defaulted
// parameter, so existing callers are unaffected.
// Returns an empty vector (after printing a message) when the file
// cannot be opened.
vector<double> extract_features(const string& filename, int max_features = 3) {
    vector<double> features;
    ifstream file(filename, ios::binary);
    if (!file) {
        cout << "Failed to open file: " << filename << endl;
        return features;
    }
    char byte;
    while ((int)features.size() < max_features && file.get(byte)) {
        // Plain `char` may be signed, so bytes >= 0x80 come out negative —
        // presumably intentional, matching the sample output "-40 26 -111"
        // quoted later in the post.
        features.push_back(static_cast<double>(byte));
    }
    return features; // ifstream closes itself via RAII
}
// Interactive driver: repeatedly prompt for a file path and print its
// first three byte-features, until the user types "exit".
int main() {
    string filename;
    for (;;) {
        cout << "Please input the filename, or input 'exit' to quit: ";
        cin >> filename;
        if (filename == "exit") {
            break;
        }
        vector<double> extracted = extract_features(filename);
        for (size_t k = 0; k < extracted.size(); ++k) {
            cout << extracted[k] << " ";
        }
        cout << endl;
    }
    return 0;
}
编译后,直接输入exe(只能是exe)文件路径 以免有人听不懂,输入样例
C:\Users\Admin1\source\repos\Project1\x64\Debug\1111.exe
输出(是假设的,别当真):-40 26 -111(这三个是feature值,不能有任何变动,一比一写在csv上,重中之重记住是Microsoft Excel 逗号分隔值文件其他文件一律报错)
data.csv怎么写? 第一行一定要顶格,顶在左上角,并且和数据行一样用逗号分隔: Feature1,Feature2,Feature3,Label 好,第一行写完后,顺序不许错,例如Feature1,Feature2,Label,Feature3是不行的 有人肯定会问Label是啥,Label就是你给程序贴上的标签,1为恶意程序,-1为安全程序
Feature1 Feature2 Feature3就是你之前写的程序生成出来的3个feature值原封不动打在data.csv上
例子:-40,26,-111,1(用记事本打开的第二行内容)
写完后,直接编译,啥都别想,不用第三方库(例如libsvm)为啥不用VS自带的cmath呢
第一个杀毒方式就这样写好了(内存指针老是指到空的未知(null),搞了两小时,现在是能编译了,但主题内容一个标点符号都不能改,一改就出错)