- 日报
[烧你脑子]SVM模型,懂得都懂(代替360QVM,打破自制杀软旧维度)
- 2024-1-13 21:32:19 @
现在的自制杀软最高的也只是启发式扫描
代码(烧了两个星期的脑子):
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <sstream>
#include <cmath>
#include <cstdlib>
using namespace std;
// Hyperparameters for the SVM / SMO trainer.
const int MAX_FEATURES = 1000;  // declared feature cap — NOTE(review): never referenced anywhere in this file
const double C = 1.0;           // SMO box constraint: every alpha stays in [0, C]
const double TOLERANCE = 0.001; // KKT-violation tolerance; also the minimum useful alpha step and support-vector cutoff
// One training sample: a feature vector plus its class label.
struct Data {
vector<double> features; // numeric feature values parsed from one CSV row
int label;               // +1 = malicious, -1 = benign (the convention stated later in the post)
};
// A trained SVM: the bias term plus the support vectors and their multipliers.
struct SVMModel {
double b;                     // bias (intercept) of the decision function
vector<double> alphas;        // Lagrange multipliers, one per support vector
vector<Data> support_vectors; // training samples whose alpha ended up above TOLERANCE
};
vector<Data> load_data(string filename) {
vector<Data> data;
ifstream file(filename);
string line;
while (getline(file, line)) {
bool has_duplicate = false;
Data d;
stringstream ss(line);
string feature;
while (getline(ss, feature, ',')) {
d.features.push_back(stod(feature));
}
// Check if the features are duplicates
for (int i = 0; i < data.size(); i++) {
bool is_duplicate = true;
for (int j = 0; j < d.features.size(); j++) {
if (data[i].features[j] != d.features[j]) {
is_duplicate = false;
break;
}
}
if (is_duplicate) {
has_duplicate = true;
break;
}
}
if (has_duplicate) {
continue; // Skip this data point
}
istringstream label_stream(feature);
int label;
label_stream >> label;
d.label = label;
data.push_back(d);
}
return data;
}
// Linear kernel: the dot product of two feature vectors.
// Both vectors are assumed to have the same length (the CSV loader
// produces fixed-width rows).
double kernel(vector<double>& x1, vector<double>& x2) {
    double sum = 0.0;
    size_t idx = 0;
    while (idx < x1.size()) {
        sum += x1[idx] * x2[idx];
        ++idx;
    }
    return sum;
}
// Trains a linear SVM with the simplified SMO (Sequential Minimal
// Optimization) scheme: repeatedly pick a KKT-violating sample i, pair
// it with a random j, and jointly optimize the two Lagrange multipliers.
// Returns the bias `b` plus every sample whose alpha exceeds TOLERANCE
// (the support vectors).
//
// Fixes vs. the original:
//  - n == 1 no longer hangs: `while (j == i) j = rand() % n;` could never
//    produce a second index, so a single KKT-violating sample spun forever;
//  - a sweep cap bounds total work, since simplified SMO is not guaranteed
//    to converge on noisy/inseparable data.
// The pairwise update itself is kept statement-for-statement.
SVMModel train_svm(vector<Data>& data) {
    int n = (int)data.size();
    vector<double> alphas(n, 0.0);
    double b = 0.0;
    if (n >= 2) {
        const int MAX_SWEEPS = 1000; // safety cap on full passes over the data
        int sweeps = 0;
        bool is_converged = false;
        while (!is_converged && sweeps++ < MAX_SWEEPS) {
            int num_changed_alphas = 0;
            for (int i = 0; i < n; i++) {
                // E_i = f(x_i) - y_i : current prediction error for sample i.
                double Ei = 0.0;
                for (int j = 0; j < n; j++) {
                    Ei += alphas[j] * data[j].label * kernel(data[i].features, data[j].features);
                }
                Ei = Ei - (double)data[i].label;
                // Only optimize when sample i violates the KKT conditions.
                if ((data[i].label * Ei < -TOLERANCE && alphas[i] < C) ||
                    (data[i].label * Ei > TOLERANCE && alphas[i] > 0)) {
                    // Pick a random partner index j != i (n >= 2 guarantees one exists).
                    int j = i;
                    while (j == i) {
                        j = rand() % n;
                    }
                    double Ej = 0.0;
                    for (int k = 0; k < n; k++) {
                        Ej += alphas[k] * data[k].label * kernel(data[j].features, data[k].features);
                    }
                    Ej = Ej - (double)data[j].label;
                    double alpha_i_old = alphas[i];
                    double alpha_j_old = alphas[j];
                    // L/H clip bounds keep both alphas inside the box [0, C]
                    // while preserving the linear equality constraint.
                    double L, H;
                    if (data[i].label != data[j].label) {
                        L = max(0.0, alpha_j_old - alpha_i_old);
                        H = min(C, C + alpha_j_old - alpha_i_old);
                    }
                    else {
                        L = max(0.0, alpha_i_old + alpha_j_old - C);
                        H = min(C, alpha_i_old + alpha_j_old);
                    }
                    if (L == H) {
                        continue; // no room to move this pair
                    }
                    // eta = 2*K(i,j) - K(i,i) - K(j,j); must be negative for a valid step.
                    double eta = 2.0 * kernel(data[i].features, data[j].features) -
                        kernel(data[i].features, data[i].features) -
                        kernel(data[j].features, data[j].features);
                    if (eta >= 0.0) {
                        continue;
                    }
                    // Unconstrained optimum for alpha_j, then clip into [L, H].
                    alphas[j] = alpha_j_old - data[j].label * (Ei - Ej) / eta;
                    if (alphas[j] > H) {
                        alphas[j] = H;
                    }
                    else if (alphas[j] < L) {
                        alphas[j] = L;
                    }
                    // Ignore negligible steps.
                    if (abs(alphas[j] - alpha_j_old) < TOLERANCE) {
                        continue;
                    }
                    // Move alpha_i by the opposite amount, weighted by the label product.
                    alphas[i] = alpha_i_old + data[i].label * data[j].label *
                        (alpha_j_old - alphas[j]);
                    // Recompute the bias from whichever alpha stayed strictly inside (0, C).
                    double b1 = b - Ei - data[i].label * (alphas[i] - alpha_i_old) *
                        kernel(data[i].features, data[i].features) -
                        data[j].label * (alphas[j] - alpha_j_old) *
                        kernel(data[i].features, data[j].features);
                    double b2 = b - Ej - data[i].label * (alphas[i] - alpha_i_old) *
                        kernel(data[i].features, data[j].features) -
                        data[j].label * (alphas[j] - alpha_j_old) *
                        kernel(data[j].features, data[j].features);
                    if (0.0 < alphas[i] && alphas[i] < C) {
                        b = b1;
                    }
                    else if (0.0 < alphas[j] && alphas[j] < C) {
                        b = b2;
                    }
                    else {
                        b = (b1 + b2) / 2.0;
                    }
                    num_changed_alphas++;
                }
            }
            if (num_changed_alphas == 0) {
                is_converged = true; // full sweep with no updates => done
            }
        }
    }
    SVMModel model;
    model.b = b;
    // Keep only samples with a meaningfully non-zero alpha: the support vectors.
    for (int i = 0; i < n; i++) {
        if (alphas[i] > TOLERANCE) {
            model.alphas.push_back(alphas[i]);
            model.support_vectors.push_back(data[i]);
        }
    }
    return model;
}
// Classifies `features` with the trained model.
// Decision value: sum_i alpha_i * y_i * K(sv_i, x) + b.
// Returns +1 when the decision value is positive, -1 otherwise.
int predict(vector<double>& features, SVMModel& model) {
    double decision = model.b;
    for (size_t sv = 0; sv < model.alphas.size(); ++sv) {
        decision += model.alphas[sv] * model.support_vectors[sv].label *
            kernel(model.support_vectors[sv].features, features);
    }
    return (decision > 0.0) ? 1 : -1;
}
// Returns true if any line of `filename` contains `str` as a substring.
// A file that cannot be opened simply yields false — the original called
// exit(1) here, silently terminating the entire process from inside a
// query helper. The ifstream closes itself via RAII.
bool file_contains_string(const string& filename, const string& str) {
    ifstream file(filename);
    if (!file.is_open()) {
        return false;
    }
    string line;
    while (getline(file, line)) {
        if (line.find(str) != string::npos) {
            return true;
        }
    }
    return false;
}
int main(int argc, char* argv[]) {
string input_filename;
string output_filename;
if (argc >= 2) {
input_filename = argv[1];
output_filename = argv[2];
}
else {
cin >> input_filename;
cin >> output_filename;
}
vector<Data> data = load_data(input_filename);
SVMModel model = train_svm(data);
ofstream outfile(output_filename);
if (!outfile.is_open()) {
return 1;
}
for (int i = 0; i < data.size(); i++) {
int prediction = predict(data[i].features, model);
outfile << prediction << endl;
}
outfile.close();
if (file_contains_string(output_filename, "Prediction: 1")) {
cout << "Find The Virus " << endl;
}
else {
}
return 0;
}
运行环境 Visual Studio 2022 读取的文件 exe要扫描的文件,data.csv
先不谈data.csv,先谈公式(不想用神经网络,只能用在主动防御,再说也是静态扫描,要是在静态里实现动态扫描要把程序放到虚拟机里运行,还要用libvirt,又要会xml写虚拟机配置文件太麻烦了)
一、公式
1.内积的计算:
两个向量的内积(点积)的计算公式为: $\langle x_1, x_2 \rangle = \sum_{i=1}^{n} x_{1i} \, x_{2i}$
在代码中,通过循环遍历两个向量的对应元素,将其相乘并求和,得到内积结果。
2.核函数: 本代码使用线性核,即直接以两个特征向量的内积作为核函数值,不做任何非线性映射。
3.序列最小优化(SMO)算法: SMO算法是一种用于训练SVM模型的算法,它通过迭代优化拉格朗日乘子来最小化目标函数,实现模型参数的求解。在代码实现中,通过实现SMO算法来训练SVM模型,并计算出SVM模型中的偏置项、拉格朗日乘子和支持向量等参数。
二 、程序运算过程
->1.文件中加载数据集,并将其存储在一个vector<Data>
中.
->2.计算两个向量之间的内积,用于SVM的核函数.
->3.训练SVM模型,使用序列最小优化(SMO)算法来优化模型参数。
->4.对给定的特征向量进行预测,返回预测结果。
三、数据
data.csv怎么写
First先写代码(VS 2022),直接copy
#include <iostream>
#include <fstream>
#include <vector>
using namespace std;
// Reads the first `max_features` bytes of `filename` (opened in binary
// mode) and returns each byte as a double feature value.
// The cap was a hard-coded 3 in the original; it is now a defaulted
// parameter, so existing callers are unaffected.
// Returns an empty vector (after printing a message) when the file
// cannot be opened.
vector<double> extract_features(const string& filename, int max_features = 3) {
    vector<double> features;
    ifstream file(filename, ios::binary);
    if (!file) {
        cout << "Failed to open file: " << filename << endl;
        return features;
    }
    char byte;
    while ((int)features.size() < max_features && file.get(byte)) {
        // Plain `char` may be signed, so bytes >= 0x80 come out negative —
        // presumably intentional, matching the sample output "-40 26 -111"
        // quoted later in the post.
        features.push_back(static_cast<double>(byte));
    }
    return features; // ifstream closes itself via RAII
}
// Interactive driver: repeatedly prompt for a file path and print its
// first three byte-features, until the user types "exit".
int main() {
    string filename;
    for (;;) {
        cout << "Please input the filename, or input 'exit' to quit: ";
        cin >> filename;
        if (filename == "exit") {
            break;
        }
        vector<double> extracted = extract_features(filename);
        for (size_t k = 0; k < extracted.size(); ++k) {
            cout << extracted[k] << " ";
        }
        cout << endl;
    }
    return 0;
}
编译后,直接输入exe(只能是exe)文件路径 以免有人听不懂,输入样例
C:\Users\Admin1\source\repos\Project1\x64\Debug\1111.exe
输出(是假设的,别当真):-40 26 -111(这三个是feature值,不能有任何变动,一比一写在csv上,重中之重记住是Microsoft Excel 逗号分隔值文件其他文件一律报错)
data.csv怎么写? 第一行一定要顶格,顶在左上角,并且和数据行一样用逗号分隔: Feature1,Feature2,Feature3,Label 好,第一行写完后,顺序不许错,例如Feature1,Feature2,Label,Feature3是不行的 有人肯定会问Label是啥,Label就是你给程序贴上的标签,1为恶意程序,-1为安全程序
Feature1 Feature2 Feature3就是你之前写的程序生成出来的3个feature值原封不动打在data.csv上
例子:-40,26,-111,1(用记事本打开的第二行内容)
写完后,直接编译,啥都别想,不用第三方库(例如libsvm)为啥不用VS自带的cmath呢
第一个杀毒方式就这样写好了(内存指针老是指到空的未知(null),搞了两小时,现在是能编译了,但主题内容一个标点符号都不能改,一改就出错)