Python中怎么实现knn算法

发布时间：2021-08-07 16:07:26 作者：Leah
来源：亿速云阅读：258

本篇文章给大家分享的是有关Python中怎么实现knn算法，小编觉得挺实用的，因此分享给大家学习，希望大家阅读完这篇文章后可以有所收获，话不多说，跟着小编一起来看看吧。

　　一、题目名称

　　实现knn分类算法

　　二、题目内容

　　原生Python实现knn分类算法，并使用鸢尾花数据集进行测试

　　三、算法分析

　　knn算法是最简单的机器学习算法之一，通过测量不同特征值之间的距离进行分类。其基本思路是：如果一个样本在特征空间中的k个最相似(即特征空间中最近邻)的样本中的大多数属于某一个类别，则该样本也属于这个类别。

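　　本次作业主要模拟实现了knn测试数据与训练数据之间的距离求解、排序、最邻近k个元素的筛选。其中，空间距离采用“欧氏距离”进行计算，表达式如下：

　　$$dist[i] = \sqrt{\sum_{j=1}^{n} \left(x_t^{(j)} - x_i^{(j)}\right)^2}$$

　　上式中dist[i] 为测试数据与下标为i的训练数据的距离，$x_t$、$x_i$ 分别为测试数据和下标为i的训练数据，$x^{(j)}$ 表示样本的第j个特征，n为特征个数。算法整体流程图如下：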

　　图 1 knn算法流程图

　　四、调试截图

　　调试过程主要的任务是观察数据结构：Python中的嵌套结构较为复杂，需要清楚每一步输出结果的维度和具体数据结构

　　五、运行结果

　　本次作业中的输入为鸢尾花数据集，输出为预测后的鸢尾花类型。最初设想采用散点图输出，但原生Python散点图效果较差，故改为直接字符串输出，输出类别即可，得出运行结果

　　图 4 原生Python散点图效果较差

　　图 5 改为直接字符串输出类别

　　六、问题及解决

　　实现过程中遇到的主要问题是数据结构的混淆。在knn实现类中，经过多次列表生成与嵌套，容易混淆各步数据的结构，从而出现下标越界、维数不匹配等错误。解决办法也很简单：用debug查看数据结构，或者直接print输出每一步的内容进行观察。

　　图 6 下标错误

　　七、源代码

　　1.knn.py

　　# !/usr/bin/env python

　　# -*- encoding: utf-8 -*-

　　# @Project : machinelearning

　　# @File : knn.py

　　# @Author : yanchengxu

　　# @Contact : yanchengxu1214@outlook.com

　　# @Time : 2019/10/7 16:14

　　# @IDE : PyCharm

　　import numpy as np

　　import math

class KNNClassifier:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Attributes:
        k: number of neighbours that vote for each query point.
        X_train: training feature matrix stored by ``fit``.
        y_train: training labels stored by ``fit``.
        result: labels produced by the most recent ``vote``/``predict`` call.
    """

    def __init__(self, k=3):
        """Create an unfitted classifier.

        :param k: number of nearest neighbours consulted per query; default 3
        """
        self.k = k
        self.X_train = []
        self.y_train = []
        self.result = []

    def fit(self, X_train, y_train):
        """Store the training data; no computation happens at fit time.

        :param X_train: training features, an array with one row per sample
        :param y_train: training labels, one entry per row of ``X_train``
        :return: self
        :raises AssertionError: if the sample counts differ, or if ``k`` is
            smaller than 1 or larger than the number of training samples
        """
        assert X_train.shape[0] == y_train.shape[0], '训练集特征与目标值个数不匹配'
        # k must also be at least 1, otherwise no neighbour could vote.
        assert 1 <= self.k <= X_train.shape[0], 'K值超出训练数据范围'
        self.X_train = X_train
        self.y_train = y_train
        return self

    def get_distance(self, x_test):
        """Compute the Euclidean distance from every query to every training row.

        All feature columns take part in the distance, not just the first two.

        :param x_test: query points, one row per point
        :return: list with one entry per query point; entry ``i`` is a list of
            floats where position ``j`` is the distance to training row ``j``
        """
        train = np.asarray(self.X_train, dtype=float)
        list_dist = []
        for point in np.asarray(x_test, dtype=float):
            # Broadcasting subtracts the query from every training row at once.
            squared = np.sum((train - point) ** 2, axis=1)
            list_dist.append(np.sqrt(squared).tolist())
        return list_dist

    def get_k_nearest_dist(self, list_dist):
        """Keep the ``k`` smallest distances for every query point.

        :param list_dist: per-query distance lists as returned by
            ``get_distance``
        :return: list with one dict per query point mapping training-row index
            to distance, ordered from nearest to farthest
        """
        list_k_nearest_dist = []
        for dists in list_dist:
            # sorted() is stable, so equal distances keep training-row order.
            ranked = sorted(enumerate(dists), key=lambda pair: pair[1])
            list_k_nearest_dist.append(dict(ranked[:self.k]))
        return list_k_nearest_dist

    def vote(self, k_nearest_dist):
        """Pick the majority label among each query point's nearest neighbours.

        Ties are resolved in favour of the label encountered first when walking
        the neighbours from nearest to farthest.

        :param k_nearest_dist: per-query ``{train_index: distance}`` dicts as
            returned by ``get_k_nearest_dist``
        :return: list of predicted labels, one per query point (also stored in
            ``self.result``)
        """
        # Start from a fresh list so earlier predictions are not returned again.
        self.result = []
        for neighbours in k_nearest_dist:
            counts = {}
            for index in neighbours:
                label = self.y_train[index]
                counts[label] = counts.get(label, 0) + 1
            # max() returns the first maximal key, i.e. the nearest on a tie.
            self.result.append(max(counts, key=counts.get))
        return self.result

    def predict(self, X_predict):
        """Predict a label for every row of ``X_predict``.

        :param X_predict: query points, an array with the same number of
            feature columns as the training data
        :return: list of predicted labels, one per query point
        :raises AssertionError: if the feature counts do not match
        """
        assert X_predict.shape[1] == self.X_train.shape[1], '特征数不匹配'
        distances = self.get_distance(X_predict)
        k_nearest_dist = self.get_k_nearest_dist(distances)
        return self.vote(k_nearest_dist)

　　2.test.py

　　# !/usr/bin/env python

　　# -*- encoding: utf-8 -*-

　　# @Project : machinelearning

　　# @File : test.py

　　# @Author : yanchengxu

　　# @Contact : yanchengxu1214@outlook.com

　　# @Time : 2019/10/7 16:57

　　# @IDE : PyCharm

　　from sklearn.datasets import load_iris

　　from sklearn.model_selection import train_test_split

　　from myknn.knn import KNNClassifier

　　import numpy as np

　　import matplotlib.pyplot as plt

　　# import itertools

　　# import random

# Demo: train the hand-written KNN classifier on the iris data set and
# classify three hand-picked flowers.
kn = KNNClassifier(3)

iris_dataset = load_iris()

# Only the training split is used here; the held-out split is left untouched.
X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset['data'], iris_dataset['target'], random_state=0)
X = np.asarray(X_train)
Y = np.asarray(y_train)

# Train the model.
kn.fit(X, Y)

# Query points: sepal length, sepal width, petal length, petal width.
x_test = [[5, 2.9, 1, 0.2], [6.7, 3.2, 5.2, 2.3], [5.6, 3.1, 4.5, 1.5]]
samples = np.asarray(x_test)
prediction = kn.predict(samples)

# Print each query point next to the name of its predicted species.
for sample, label in zip(x_test, prediction):
    print(sample, '->', iris_dataset['target_names'][label])

# Optional scatter plot of the first two features (training set as red
# stars, query points as green pluses); uncomment to display:
# plt.plot(X[:, 0], X[:, 1], 'r*')
# plt.plot(samples[:, 0], samples[:, 1], 'g+')
# plt.show()

以上就是Python中怎么实现knn算法，小编相信有部分知识点可能是我们日常工作会见到或用到的。希望你能通过这篇文章学到更多知识。更多详情敬请关注亿速云行业资讯频道。

Python中怎么实现knn算法

相关阅读