我在寻找如何使用Encog框架创建一个简单的垃圾邮件过滤/分类或聚类应用的示例。谷歌上我没能找到任何相关内容。
我还购买了Jeff Heaton的书《Programming Neural Networks with Encog3 in C#》(用Encog3和C#编写神经网络程序),但书中也没有找到任何关于此类应用的示例。
有谁能提供一个简单的应用示例,展示如何根据邮件的主题和正文文本将邮件分类为垃圾邮件吗?
编辑:我已经看过如何在Python中实现的方法,但我问的是,有谁能提供使用Encog + C#的具体示例,展示如何创建一个垃圾邮件过滤/分类应用?
回答:
大多数垃圾邮件过滤器使用某种贝叶斯分类法,最常见的是朴素贝叶斯分类法。以下是一段无需任何额外框架即可使用的(高斯)朴素贝叶斯分类器代码:它按类别统计每个数值特征的均值和方差,再据此对新样本进行分类。
/// <summary>
/// Trains the classifier from a labelled table: column 0 holds the class
/// label (string) and every following column holds one numeric feature.
/// The training table is stored in <c>dataSet</c>, and a second table named
/// "Gaussian" is created with one row per class containing, for each
/// feature, its mean and variance (columns "&lt;Feature&gt;Mean" and
/// "&lt;Feature&gt;Variance", in that order).
/// </summary>
/// <param name="table">Training data; label in column 0, features after it.</param>
public void TrainClassifier(DataTable table)
{
    dataSet.Tables.Add(table);

    string labelColumn = table.Columns[0].ColumnName;

    // Table that will hold the per-class statistics.
    DataTable gaussian = dataSet.Tables.Add("Gaussian");
    gaussian.Columns.Add(labelColumn);

    // Two statistic columns per feature. They are typed as double so the
    // values are stored as numbers instead of culture-formatted strings.
    for (int i = 1; i < table.Columns.Count; i++)
    {
        gaussian.Columns.Add(table.Columns[i].ColumnName + "Mean", typeof(double));
        gaussian.Columns.Add(table.Columns[i].ColumnName + "Variance", typeof(double));
    }

    // Distinct class labels, in order of first appearance in the training data.
    List<string> classNames = table.AsEnumerable()
        .GroupBy(r => r.Field<string>(labelColumn))
        .Select(g => g.Key)
        .ToList();

    foreach (string className in classNames)
    {
        DataRow row = gaussian.Rows.Add();
        row[0] = className;

        // Row filter selecting this class; it is the same for every feature.
        // A single quote inside the label is doubled so it cannot terminate
        // the string literal (assumes SelectRows evaluates the filter with
        // DataTable expression syntax -- confirm against SelectRows).
        string escaped = className == null ? null : className.Replace("'", "''");
        string filter = string.Format("{0} = '{1}'", labelColumn, escaped);

        // Feature i writes its mean to column 2i-1 and its variance to column 2i.
        for (int i = 1; i < table.Columns.Count; i++)
        {
            row[2 * i - 1] = Helper.Mean(SelectRows(table, i, filter));
            row[2 * i] = Helper.Variance(SelectRows(table, i, filter));
        }
    }
}
/// <summary>
/// Classifies a feature vector with the statistics stored in the "Gaussian"
/// table of <c>dataSet</c>. For every class the per-feature values returned
/// by <c>Helper.NormalDist</c> are multiplied together, and the label of the
/// class with the largest product is returned.
/// </summary>
/// <param name="obj">
/// Feature values, in the same order as the feature columns of the training
/// table. Elements beyond the number of trained features are ignored.
/// </param>
/// <returns>The label of the highest-scoring class.</returns>
/// <exception cref="ArgumentNullException"><paramref name="obj"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="obj"/> has fewer values than there are trained features.</exception>
/// <exception cref="InvalidOperationException">The "Gaussian" table is missing, or no class produced a comparable score.</exception>
public string Classify(double[] obj)
{
    if (obj == null)
    {
        throw new ArgumentNullException("obj");
    }

    DataTable gaussian = dataSet.Tables["Gaussian"];
    if (gaussian == null)
    {
        throw new InvalidOperationException("The classifier has not been trained.");
    }

    // Column 0 is the class label; after it come (mean, variance) pairs,
    // one pair per feature.
    int featureCount = (gaussian.Columns.Count - 1) / 2;
    if (obj.Length < featureCount)
    {
        throw new ArgumentException(
            string.Format("Expected at least {0} feature values but got {1}.", featureCount, obj.Length),
            "obj");
    }

    DataRow bestRow = null;
    double bestScore = double.NegativeInfinity;

    foreach (DataRow row in gaussian.Rows)
    {
        // Start the product at 1 so that a likelihood of exactly 0 keeps the
        // whole product at 0 instead of being mistaken for "not started yet".
        double likelihood = 1.0;
        for (int f = 0; f < featureCount; f++)
        {
            double mean = Convert.ToDouble(row[2 * f + 1]);
            double variance = Convert.ToDouble(row[2 * f + 2]);
            likelihood *= Helper.NormalDist(obj[f], mean, Helper.SquareRoot(variance));
        }

        // Constant factor kept from the original code; it is identical for
        // every class, so it does not influence which class wins.
        double score = likelihood * 0.5;

        // Strict '>' keeps the first class on ties and never selects NaN.
        if (score > bestScore)
        {
            bestScore = score;
            bestRow = row;
        }
    }

    if (bestRow == null)
    {
        throw new InvalidOperationException("No class produced a comparable score.");
    }

    return bestRow.Field<string>(0);
}
编辑:以下是它的使用方法:
// Example: train the classifier on a small labelled table and classify one
// sample. Column 0 is the class label; the remaining columns are numeric
// features.
DataTable table = new DataTable();
table.Columns.Add("Sex");
table.Columns.Add("Height", typeof(double));
table.Columns.Add("Weight", typeof(double));
table.Columns.Add("FootSize", typeof(double));

// Training data.
table.Rows.Add("male", 6, 180, 12);
table.Rows.Add("male", 5.92, 190, 11);
table.Rows.Add("male", 5.58, 170, 12);
table.Rows.Add("male", 5.92, 165, 10);
table.Rows.Add("female", 5, 100, 6);
table.Rows.Add("female", 5.5, 150, 8);
table.Rows.Add("female", 5.42, 130, 7);
table.Rows.Add("female", 5.75, 150, 9);
table.Rows.Add("transgender", 4, 200, 5);
table.Rows.Add("transgender", 4.10, 150, 8);
table.Rows.Add("transgender", 5.42, 190, 7);
table.Rows.Add("transgender", 5.50, 150, 9);

Classifier classifier = new Classifier();
classifier.TrainClassifier(table);

// According to the original answer, the output will be "transgender".
Console.WriteLine(classifier.Classify(new double[] { 4, 150, 12 }));

// Wait for input so the console window stays open.
Console.Read();