Coding - Machine Learning Classifiers

T-Test

Run an independent two-sample t-test between the samples labeled 1 and those labeled -1, keeping only the features that remain significant after FDR correction.
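
For reference, with scipy's default equal-variance setting, the statistic that ttest_ind computes for each feature is the standard two-sample t:

$$
t = \frac{\bar{x}_{\mathrm{pos}} - \bar{x}_{\mathrm{neg}}}{s_p\sqrt{\frac{1}{n_{\mathrm{pos}}} + \frac{1}{n_{\mathrm{neg}}}}},
\qquad
s_p = \sqrt{\frac{(n_{\mathrm{pos}}-1)\,s_{\mathrm{pos}}^2 + (n_{\mathrm{neg}}-1)\,s_{\mathrm{neg}}^2}{n_{\mathrm{pos}} + n_{\mathrm{neg}} - 2}}
$$

where $\bar{x}$, $s^2$, and $n$ are the mean, sample variance, and sample size of each group.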

import numpy as np
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import fdrcorrection

def t_test(data, label):
    # Split the samples into positive (label == 1) and negative (label == -1) groups
    pos_data = data[label == 1]
    neg_data = data[label == -1]

    # Independent two-sample t-test for each feature (column)
    t_values, p_values = ttest_ind(pos_data, neg_data, axis=0)

    # Apply FDR correction and keep only the significant features
    reject, p_values_corrected = fdrcorrection(p_values, alpha=0.1)
    significant_features = np.where(reject)[0]

    # Return the data restricted to the significant features
    return data[:, significant_features]
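
A minimal usage sketch with synthetic data; the sample count, feature count, random seed, and the shift applied to the first five features are arbitrary assumptions, not part of the original code:

import numpy as np

# Synthetic example: 100 samples, 50 features, labels in {1, -1} (assumed sizes)
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 50))
y = np.where(rng.random(100) > 0.5, 1, -1)

# Make the first 5 features genuinely different between the two classes
X[y == 1, :5] += 1.0

# Select the significant features
X_selected = t_test(X, y)
print(X_selected.shape)  # (100, number_of_significant_features)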

SVM

SVM classification on the Iris dataset. This code can serve as a quick feasibility check of how separable a dataset is with a simple classifier.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.svm import SVC

# Load the Iris dataset
iris = load_iris()
data = iris.data
labels = iris.target

# Split into training and test sets
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.2, random_state=1)

print(train_data.shape)
print(test_data.shape)

# Define the parameter grid: C in {2^-4, ..., 2^4}
x = np.logspace(-4, 4, num=9, base=2)
param_grid = {
    "C": x,
}

# Define the model and the cross-validation scheme
model = SVC(kernel="linear")
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# Set up the grid search
grid_search = GridSearchCV(model, param_grid, cv=cv, scoring="accuracy")

# Run cross-validated grid search on the training set
grid_search.fit(train_data, train_labels)

# Report the best parameters and the cross-validation score
print("Best parameters: ", grid_search.best_params_)
print("Cross-validation score: ", grid_search.best_score_)

# Evaluate the refitted best model on the test set
test_score = grid_search.score(test_data, test_labels)
print("Test set score: ", test_score)

Naive Bayes

Naive Bayes classification on the Iris dataset.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = load_iris()
data = iris.data
labels = iris.target

# Split into training and test sets
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.2, random_state=1)

# Create a Gaussian Naive Bayes classifier
classifier = GaussianNB()

# Train the model on the training set
classifier.fit(train_data, train_labels)

# Predict on the test set
predictions = classifier.predict(test_data)

# Compute the classification accuracy
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy: ", accuracy)

K-Nearest Neighbors

K-nearest-neighbors classification on the Iris dataset.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = load_iris()
data = iris.data
labels = iris.target

# Split into training and test sets
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.2, random_state=1)

# Create a KNN classifier with k = 5
classifier = KNeighborsClassifier(n_neighbors=5)

# Train the model on the training set
classifier.fit(train_data, train_labels)

# Predict on the test set
predictions = classifier.predict(test_data)

# Compute the classification accuracy
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy: ", accuracy)