我在使用ML.NET的二元分类算法处理威斯康星乳腺癌数据。在训练模型后,我发现每个实例都被评估为假。我的测试文件中有100个实例,其中75个为负例,25个为正例。因此,从指标来看,准确率为0.75,负例精确度为0.75。这意味着所有实例都被评估为0(假)。
private static string trainingDataPath = Path.Combine(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "uploads"), "data.csv"); private static string testDataPath = Path.Combine(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "uploads"), "test.csv"); public bool checkDiagnostic (BreastCancerData input) { // 创建一个新的机器学习上下文 var mlContext = new MLContext(); // 加载训练和测试数据 var trainingDataView = mlContext.Data.LoadFromTextFile<BreastCancerData>(trainingDataPath, hasHeader: false, separatorChar: ','); var testDataView = mlContext.Data.LoadFromTextFile<BreastCancerData>(testDataPath, hasHeader: false, separatorChar: ','); // 预览数据。 //var dataPreview = trainingDataView.Preview(maxRows:700); //var dataPreview2 = testDataView.Preview(); // 训练代码的其余部分在这里... var trainer = mlContext.BinaryClassification.Trainers.LinearSvm("Label", "Features"); var trainingPipeline = mlContext.Transforms.Concatenate(outputColumnName: "Features", nameof(BreastCancerData.AreaMean), nameof(BreastCancerData.AreaSe), nameof(BreastCancerData.AreaWorst), nameof(BreastCancerData.CompactnessMean), nameof(BreastCancerData.CompactnessSe), nameof(BreastCancerData.CompactnessWorst), nameof(BreastCancerData.ConcavePointsMean), nameof(BreastCancerData.ConcavePointsSe), nameof(BreastCancerData.ConcavePointsWorst), nameof(BreastCancerData.ConcavityMean), nameof(BreastCancerData.ConcavitySe), nameof(BreastCancerData.ConcavityWorst), nameof(BreastCancerData.FractalDimensionMean), nameof(BreastCancerData.FractalDimensionSe), nameof(BreastCancerData.FractalDimensionWorst), nameof(BreastCancerData.Id), nameof(BreastCancerData.PerimeterMean), nameof(BreastCancerData.PerimeterSe), nameof(BreastCancerData.PerimeterWorst), nameof(BreastCancerData.RadiusMean), nameof(BreastCancerData.RadiusSe), nameof(BreastCancerData.SmoothnessMean), nameof(BreastCancerData.SmoothnessSe), nameof(BreastCancerData.SmoothnessWorst), nameof(BreastCancerData.SymmetryMean), nameof(BreastCancerData.SymmetrySe), nameof(BreastCancerData.SymmetryWorst), nameof(BreastCancerData.TextureMean), nameof(BreastCancerData.TextureSe), nameof(BreastCancerData.TextureWorst)) .Append(mlContext.Transforms.CopyColumns(outputColumnName: "Label", inputColumnName: nameof(BreastCancerData.Diagnosis))) .Append(trainer); // 预览训练和转换的结果。 var transformationPreview = trainingPipeline.Preview(trainingDataView, maxRows: 700); try { var model = trainingPipeline.Fit(trainingDataView); using (var file = File.OpenWrite(Path.Combine(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "uploads"), "model.ZIP"))) mlContext.Model.Save(model, trainingDataView.Schema, file); ITransformer trainedModel; using (var stream = File.OpenRead(Path.Combine(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "uploads"), "model.ZIP"))) { trainedModel = mlContext.Model.Load(stream, out var modelInputSchema); } var predictionEngine = mlContext.Model.CreatePredictionEngine<BreastCancerData, BreastCancerPrediction>(trainedModel); Console.WriteLine("** 测试产品1 **"); // 预测 BreastCancerPrediction prediction = predictionEngine.Predict(input); Console.WriteLine($"产品: {input.Id} - 诊断结果: {prediction.Prediction}"); BinaryClassificationMetrics metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(model.Transform(testDataView), "Label"); Console.WriteLine($"准确率: {metrics.Accuracy:P2}"); Console.WriteLine($"负例精确度: {metrics.NegativePrecision:P2}"); Console.WriteLine($"正例精确度: {metrics.PositivePrecision:P2}"); Console.WriteLine($"正例召回率: {metrics.PositiveRecall:P2}"); Console.WriteLine($"负例召回率: {metrics.NegativeRecall:P2}"); Console.WriteLine($"ROC曲线下面积: {metrics.AreaUnderRocCurve:P2}"); return prediction.Prediction; } catch (Exception e) { } return false;
编辑1:
public class BreastCancerData { [LoadColumn(0)] public float Id { get; set; } [LoadColumn(1)] public bool Diagnosis { get; set; } [LoadColumn(2)] public float RadiusMean { get; set; } [LoadColumn(3)] public float TextureMean { get; set; } [LoadColumn(4)] public float PerimeterMean { get; set; } [LoadColumn(5)] public float AreaMean { get; set; } [LoadColumn(6)] public float SmoothnessMean { get; set; } [LoadColumn(7)] public float CompactnessMean { get; set; } [LoadColumn(8)] public float ConcavityMean { get; set; } [LoadColumn(9)] public float ConcavePointsMean { get; set; } [LoadColumn(10)] public float SymmetryMean { get; set; } [LoadColumn(11)] public float FractalDimensionMean { get; set; } [LoadColumn(12)] public float RadiusSe { get; set; } [LoadColumn(13)] public float TextureSe { get; set; } [LoadColumn(14)] public float PerimeterSe { get; set; } [LoadColumn(15)] public float AreaSe { get; set; } [LoadColumn(16)] public float SmoothnessSe { get; set; } [LoadColumn(17)] public float CompactnessSe { get; set; } [LoadColumn(18)] public float ConcavitySe { get; set; } [LoadColumn(19)] public float ConcavePointsSe { get; set; } [LoadColumn(20)] public float SymmetrySe { get; set; } [LoadColumn(21)] public float FractalDimensionSe { get; set; } [LoadColumn(22)] public float RadiusWorst { get; set; } [LoadColumn(23)] public float TextureWorst { get; set; } [LoadColumn(24)] public float PerimeterWorst { get; set; } [LoadColumn(25)] public float AreaWorst { get; set; } [LoadColumn(26)] public float SmoothnessWorst { get; set; } [LoadColumn(27)] public float CompactnessWorst { get; set; } [LoadColumn(28)] public float ConcavityWorst { get; set; } [LoadColumn(29)] public float ConcavePointsWorst { get; set; } [LoadColumn(30)] public float SymmetryWorst { get; set; } [LoadColumn(31)] public float FractalDimensionWorst { get; set; } } public class BreastCancerPrediction : BreastCancerData { [ColumnName("PredictedLabel")] public bool Prediction { get; set; } }
请不要在意我使用了如此多的属性而不是向量。
回答:
好的,我通过删除Id
属性解决了这个问题。现在,使用SDCA算法的准确率达到了97.x%。