Go语言机器学习实战:构建分类模型
Go语言机器学习实战:构建分类模型

引言

机器学习是人工智能的核心领域之一，它使计算机能够从数据中学习并做出预测。Go语言以其高性能和并发能力，成为构建机器学习应用的理想选择。本文将介绍如何使用Go语言构建机器学习分类模型。

一、机器学习基础概念

1.1 监督学习与无监督学习

// 监督学习：有标签数据
// 无监督学习：无标签数据，聚类分析
// 分类问题：预测离散值，如猫/狗
// 回归问题：预测连续值，如房价

1.2 评估指标

// 准确率：正确预测的比例
// 精确率：预测为正例的样本中真正为正例的比例
// 召回率：真正为正例的样本中被预测为正例的比例
// F1分数：精确率和召回率的调和平均

二、决策树算法

2.1 实现决策树

package main

import (
	"fmt"
	"math"
)

type DecisionTreeNode struct {
	feature   int
	threshold float64
	left      *DecisionTreeNode
	right     *DecisionTreeNode
	class     int
	isLeaf    bool
}

func giniImpurity(labels []int) float64 {
	counts := make(map[int]int)
	for _, label := range labels {
		counts[label]++
	}

	impurity := 1.0
	total := float64(len(labels))
	for _, count := range counts {
		prob := float64(count) / total
		impurity -= prob * prob
	}
	return impurity
}

func splitData(data [][]float64, labels []int, feature int, threshold float64) ([][]float64, []int, [][]float64, []int) {
	var leftData, rightData [][]float64
	var leftLabels, rightLabels []int

	for i, row := range data {
		if row[feature] <= threshold {
			leftData = append(leftData, row)
			leftLabels = append(leftLabels, labels[i])
		} else {
			rightData = append(rightData, row)
			rightLabels = append(rightLabels, labels[i])
		}
	}
	return leftData, leftLabels, rightData, rightLabels
}

func findBestSplit(data [][]float64, labels []int) (int, float64, float64) {
	bestFeature := -1
	bestThreshold := 0.0
	bestGain := 0.0

	numFeatures := len(data[0])
	currentImpurity := giniImpurity(labels)

	for feature := 0; feature < numFeatures; feature++ {
		// 获取该特征的所有值
		values := make(map[float64]bool)
		for _, row := range data {
			values[row[feature]] = true
		}

		for value := range values {
			leftData, leftLabels, rightData, rightLabels := splitData(data, labels, feature, value)
			if len(leftLabels) == 0 || len(rightLabels) == 0 {
				continue
			}

			// 计算信息增益
			leftWeight := float64(len(leftLabels)) / float64(len(labels))
			rightWeight := float64(len(rightLabels)) / float64(len(labels))
			gain := currentImpurity - leftWeight*giniImpurity(leftLabels) - rightWeight*giniImpurity(rightLabels)

			if gain > bestGain {
				bestGain = gain
				bestFeature = feature
bestThreshold = value
			}
		}
	}

	return bestFeature, bestThreshold, bestGain
}

func buildTree(data [][]float64, labels []int, depth int, maxDepth int) *DecisionTreeNode {
	// 终止条件
	if depth >= maxDepth || len(labels) == 0 {
		// 返回最常见的类别
		counts := make(map[int]int)
		for _, label := range labels {
			counts[label]++
		}
		maxCount := 0
		bestClass := 0
		for class, count := range counts {
			if count > maxCount {
				maxCount = count
				bestClass = class
			}
		}
		return &DecisionTreeNode{
			class:  bestClass,
			isLeaf: true,
		}
	}

	feature, threshold, gain := findBestSplit(data, labels)
	if gain == 0 {
		// 返回最常见的类别
		counts := make(map[int]int)
		for _, label := range labels {
			counts[label]++
		}
		maxCount := 0
		bestClass := 0
		for class, count := range counts {
			if count > maxCount {
				maxCount = count
				bestClass = class
			}
		}
		return &DecisionTreeNode{
			class:  bestClass,
			isLeaf: true,
		}
	}

	leftData, leftLabels, rightData, rightLabels := splitData(data, labels, feature, threshold)

	node := &DecisionTreeNode{
		feature:   feature,
		threshold: threshold,
		isLeaf:    false,
	}
	node.left = buildTree(leftData, leftLabels, depth+1, maxDepth)
	node.right = buildTree(rightData, rightLabels, depth+1, maxDepth)

	return node
}

func (node *DecisionTreeNode) Predict(row []float64) int {
	if node.isLeaf {
		return node.class
	}
	if row[node.feature] <= node.threshold {
		return node.left.Predict(row)
	}
	return node.right.Predict(row)
}

func main() {
	// 示例数据：特征为[年龄, 收入]，标签为是否购买(0/1)
	data := [][]float64{
		{25, 40000}, {35, 60000}, {45, 80000}, {20, 20000},
		{30, 50000}, {40, 70000}, {50, 90000}, {22, 30000},
	}
	labels := []int{0, 1, 1, 0, 0, 1, 1, 0}

	tree := buildTree(data, labels, 0, 3)

	// 预测
	testData := []float64{32, 55000}
	prediction := tree.Predict(testData)
	fmt.Printf("预测结果: %d\n", prediction)
}

三、随机森林

3.1 实现随机森林

package main

import (
	"fmt"
	"math/rand"
	"time"
)

type RandomForest struct {
	trees    []*DecisionTreeNode
	numTrees int
	maxDepth int
}

func NewRandomForest(numTrees, maxDepth int) *RandomForest {
	rand.Seed(time.Now().UnixNano())
	return &RandomForest{
		numTrees: numTrees,
		maxDepth: maxDepth,
	}
}

func (rf *RandomForest) Fit(data
[][]float64, labels []int) {
	for i := 0; i < rf.numTrees; i++ {
		// 随机采样，有放回
		sampleData, sampleLabels := bootstrapSample(data, labels)
		tree := buildTree(sampleData, sampleLabels, 0, rf.maxDepth)
		rf.trees = append(rf.trees, tree)
	}
}

func bootstrapSample(data [][]float64, labels []int) ([][]float64, []int) {
	n := len(data)
	sampleData := make([][]float64, n)
	sampleLabels := make([]int, n)

	for i := 0; i < n; i++ {
		idx := rand.Intn(n)
		sampleData[i] = data[idx]
		sampleLabels[i] = labels[idx]
	}
	return sampleData, sampleLabels
}

func (rf *RandomForest) Predict(row []float64) int {
	votes := make(map[int]int)
	for _, tree := range rf.trees {
		prediction := tree.Predict(row)
		votes[prediction]++
	}

	maxVotes := 0
	bestClass := 0
	for class, votes := range votes {
		if votes > maxVotes {
			maxVotes = votes
			bestClass = class
		}
	}
	return bestClass
}

func main() {
	data := [][]float64{
		{25, 40000}, {35, 60000}, {45, 80000}, {20, 20000},
		{30, 50000}, {40, 70000}, {50, 90000}, {22, 30000},
	}
	labels := []int{0, 1, 1, 0, 0, 1, 1, 0}

	rf := NewRandomForest(10, 3)
	rf.Fit(data, labels)

	testData := []float64{32, 55000}
	prediction := rf.Predict(testData)
	fmt.Printf("随机森林预测结果: %d\n", prediction)
}

四、支持向量机

4.1 SVM基础

package main

import (
	"fmt"
	"math"
)

type SVM struct {
	weights []float64
	bias    float64
	lr      float64
}

func NewSVM(featureCount int, lr float64) *SVM {
	return &SVM{
		weights: make([]float64, featureCount),
		bias:    0,
		lr:      lr,
	}
}

func (svm *SVM) trainOne(data []float64, label int) {
	// 预测
	prediction := svm.predictRaw(data)

	// 如果分类正确，不更新
	if label*prediction >= 1 {
		return
	}

	// 更新权重和偏置
	for i := range svm.weights {
		svm.weights[i] += svm.lr * float64(label) * data[i]
	}
	svm.bias += svm.lr * float64(label)
}

func (svm *SVM) predictRaw(data []float64) float64 {
	var result float64
	for i, w := range svm.weights {
		result += w * data[i]
	}
	result += svm.bias
	return result
}

func (svm *SVM) Predict(data []float64) int {
	result := svm.predictRaw(data)
	if result >= 0 {
		return 1
	}
	return -1
}

func main() {
	// 线性可分数据
	data := [][]float64{
		{1, 2}, {2, 3}, {3, 3},
		{2, 1}, {3, 2}, {4, 1},
	}
	labels
:= []int{1, 1, 1, -1, -1, -1}

	svm := NewSVM(2, 0.1)

	// 训练
	for epoch := 0; epoch < 100; epoch++ {
		for i, row := range data {
			svm.trainOne(row, labels[i])
		}
	}

	// 预测
	testData := []float64{2.5, 2.5}
	prediction := svm.Predict(testData)
	fmt.Printf("SVM预测结果: %d\n", prediction)
}

五、K近邻算法

5.1 KNN实现

package main

import (
	"fmt"
	"math"
	"sort"
)

type KNN struct {
	k int
}

func NewKNN(k int) *KNN {
	return &KNN{k: k}
}

func euclideanDistance(a, b []float64) float64 {
	var sum float64
	for i := range a {
		sum += math.Pow(a[i]-b[i], 2)
	}
	return math.Sqrt(sum)
}

func (knn *KNN) Predict(trainData [][]float64, trainLabels []int, testData []float64) int {
	// 计算距离
	type neighbor struct {
		distance float64
		label    int
	}

	var neighbors []neighbor
	for i, row := range trainData {
		dist := euclideanDistance(row, testData)
		neighbors = append(neighbors, neighbor{distance: dist, label: trainLabels[i]})
	}

	// 按距离排序
	sort.Slice(neighbors, func(i, j int) bool {
		return neighbors[i].distance < neighbors[j].distance
	})

	// 取前k个
	votes := make(map[int]int)
	for i := 0; i < knn.k; i++ {
		votes[neighbors[i].label]++
	}

	// 投票
	maxVotes := 0
	bestClass := 0
	for class, count := range votes {
		if count > maxVotes {
			maxVotes = count
			bestClass = class
		}
	}
	return bestClass
}

func main() {
	data := [][]float64{
		{25, 40000}, {35, 60000}, {45, 80000}, {20, 20000},
		{30, 50000}, {40, 70000}, {50, 90000}, {22, 30000},
	}
	labels := []int{0, 1, 1, 0, 0, 1, 1, 0}

	knn := NewKNN(3)

	testData := []float64{32, 55000}
	prediction := knn.Predict(data, labels, testData)
	fmt.Printf("KNN预测结果: %d\n", prediction)
}

六、模型评估

6.1 混淆矩阵

package main

import (
	"fmt"
)

func ConfusionMatrix(trueLabels, predictions []int) ([2][2]int, error) {
	if len(trueLabels) !=
len(predictions) {
		return [2][2]int{}, fmt.Errorf("长度不匹配")
	}

	var matrix [2][2]int
	for i := range trueLabels {
		trueLabel := trueLabels[i]
		predLabel := predictions[i]

		if trueLabel == 0 && predLabel == 0 {
			matrix[0][0]++ // TN
		} else if trueLabel == 0 && predLabel == 1 {
			matrix[0][1]++ // FP
		} else if trueLabel == 1 && predLabel == 0 {
			matrix[1][0]++ // FN
		} else if trueLabel == 1 && predLabel == 1 {
			matrix[1][1]++ // TP
		}
	}
	return matrix, nil
}

func Accuracy(matrix [2][2]int) float64 {
	total := matrix[0][0] + matrix[0][1] + matrix[1][0] + matrix[1][1]
	correct := matrix[0][0] + matrix[1][1]
	return float64(correct) / float64(total)
}

func Precision(matrix [2][2]int) float64 {
	predictedPositives := matrix[0][1] + matrix[1][1]
	if predictedPositives == 0 {
		return 0
	}
	return float64(matrix[1][1]) / float64(predictedPositives)
}

func Recall(matrix [2][2]int) float64 {
	actualPositives := matrix[1][0] + matrix[1][1]
	if actualPositives == 0 {
		return 0
	}
	return float64(matrix[1][1]) / float64(actualPositives)
}

func F1Score(matrix [2][2]int) float64 {
	precision := Precision(matrix)
	recall := Recall(matrix)
	if precision+recall == 0 {
		return 0
	}
	return 2 * precision * recall / (precision + recall)
}

func main() {
	trueLabels := []int{0, 1, 1, 0, 1, 0, 1, 0}
	predictions := []int{0, 1, 0, 0, 1, 1, 1, 0}

	matrix, _ := ConfusionMatrix(trueLabels, predictions)

	fmt.Printf("混淆矩阵:\n")
	fmt.Printf("[[%d %d]\n", matrix[0][0], matrix[0][1])
	fmt.Printf(" [%d %d]]\n", matrix[1][0], matrix[1][1])

	fmt.Printf("准确率: %.2f\n", Accuracy(matrix))
	fmt.Printf("精确率: %.2f\n", Precision(matrix))
	fmt.Printf("召回率: %.2f\n", Recall(matrix))
	fmt.Printf("F1分数: %.2f\n", F1Score(matrix))
}

七、交叉验证

7.1 K折交叉验证

package main

import (
	"fmt"
	"math/rand"
	"time"
)

func kFoldSplit(data [][]float64, labels []int, k int) [][][][]float64 {
	rand.Seed(time.Now().UnixNano())

	// 打乱数据
	indices := make([]int, len(data))
	for i := range indices {
		indices[i] = i
	}
	rand.Shuffle(len(indices), func(i, j int) {
		indices[i], indices[j] = indices[j], indices[i]
	})

	foldSize := len(data) / k
	folds := make([][][][]float64, k)

	for i := 0; i < k; i++ {
		start := i *
foldSize
		end := start + foldSize
		if i == k-1 {
			end = len(data)
		}

		var trainData, trainLabels [][]float64
		var testData, testLabels []int

		for j := 0; j < len(data); j++ {
			if j >= start && j < end {
				testData = append(testData, data[indices[j]])
				testLabels = append(testLabels, labels[indices[j]])
			} else {
				trainData = append(trainData, data[indices[j]])
				trainLabels = append(trainLabels, labels[indices[j]])
			}
		}

		folds[i] = [][][]float64{trainData, make([][]float64, len(trainLabels)), testData, make([][]float64, len(testLabels))}

		// 转换labels为[][]float64以便存储
		for idx, label := range trainLabels {
			folds[i][1] = append(folds[i][1], []float64{float64(label)})
		}
		for idx, label := range testLabels {
			folds[i][3] = append(folds[i][3], []float64{float64(label)})
		}
	}
	return folds
}

func main() {
	data := [][]float64{
		{1, 2}, {2, 3}, {3, 3}, {2, 1},
		{3, 2}, {4, 1}, {1, 1}, {4, 4},
	}
	labels := []int{1, 1, 1, -1, -1, -1, -1, 1}

	folds := kFoldSplit(data, labels, 4)

	for i, fold := range folds {
		fmt.Printf("Fold %d:\n", i+1)
		fmt.Printf(" 训练集大小: %d\n", len(fold[0]))
		fmt.Printf(" 测试集大小: %d\n", len(fold[2]))
	}
}

八、总结

本文介绍了如何使用Go语言构建机器学习分类模型，包括：

- 决策树：基于基尼不纯度的决策树实现
- 随机森林：集成多个决策树提高准确性
- 支持向量机：基于最大间隔分类的SVM实现
- K近邻：基于距离的分类算法
- 模型评估：混淆矩阵和评估指标
- 交叉验证：K折交叉验证

通过这些实现，你可以使用Go语言构建自己的机器学习模型，充分利用Go的性能优势。