I'm new to machine learning and am trying to implement my first Naive Bayes classifier myself to understand it better. I used the dataset from http://archive.ics.uci.edu/ml/datasets/Adult (US census data, with the classes '<=50K' and '>50K').
Here is my Python code:
#!/usr/bin/python
import sys
import csv

words_stats = {}   # {'word': {'class1': cnt, 'class2': cnt}}
words_cnt = 0
targets_stats = {} # {'class1': 3234, 'class2': 884} how many words in each class
class_stats = {}   # {'class1': 7896, 'class2': 3034} how many lines in each class
items_cnt = 0

def train(dataset, targets):
    global words_stats, words_cnt, targets_stats, items_cnt, class_stats
    num = len(dataset)
    for item in xrange(num):
        class_stats[targets[item]] = class_stats.get(targets[item], 0) + 1
        for i in xrange(len(dataset[item])):
            word = dataset[item][i]
            if not words_stats.has_key(word):
                words_stats[word] = {}
            tgt = targets[item]
            cnt = words_stats[word].get(tgt, 0)
            words_stats[word][tgt] = cnt + 1
            targets_stats[tgt] = targets_stats.get(tgt, 0) + 1
            words_cnt += 1
    items_cnt = num

def classify(doc, tgt_set):
    global words_stats, words_cnt, targets_stats, items_cnt
    probs = {} # the probability itself P(c|W) = P(W|c) * P(c) / P(W)
    pc = {}    # probability of the class in the document set, P(c)
    pwc = {}   # probability of the word set in a particular class, P(W|c)
    pw = 1     # probability of the word set in the document set
    for word in doc:
        if word not in words_stats:
            continue # dirty, very dirty
        pw = pw * float(sum(words_stats[word].values())) / words_cnt
    for tgt in tgt_set:
        pc[tgt] = class_stats[tgt] / float(items_cnt)
        for word in doc:
            if word not in words_stats:
                continue # dirty, very dirty
            tgt_wrd_cnt = words_stats[word].get(tgt, 0)
            pwc[tgt] = pwc.get(tgt, 1) * float(tgt_wrd_cnt) / targets_stats[tgt]
        probs[tgt] = (pwc[tgt] * pc[tgt]) / pw
    l = sorted(probs.items(), key = lambda i: i[1], reverse=True)
    print probs
    return l[0][0]

def check_results(dataset, targets):
    num = len(dataset)
    tgt_set = set(targets)
    correct = 0
    incorrect = 0
    for item in xrange(num):
        res = classify(dataset[item], tgt_set)
        if res == targets[item]:
            correct = correct + 1
        else:
            incorrect = incorrect + 1
    print 'correct:', float(correct) / num, ' incorrect:', float(incorrect) / num

def load_data(fil):
    data = []
    tgts = []
    reader = csv.reader(fil)
    for line in reader:
        d = [x.strip() for x in line]
        if '?' in d:
            continue
        if not len(d):
            continue
        data.append(d[:-1])
        tgts.append(d[-1:][0])
    return data, tgts

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print './program train_data.txt test_data.txt'
        sys.exit(1)
    filename = sys.argv[1]
    fil = open(filename, 'r')
    data, tgt = load_data(fil)
    train(data, tgt)
    test_file = open(sys.argv[2], 'r')
    test_data, test_tgt = load_data(test_file)
    check_results(test_data, tgt)
It classifies about 61% of instances correctly. When I print the probabilities, I get the following:
{'<=50K': 0.07371606889800396, '>50K': 15.325378327213354}
But for a correct classifier I would expect the two probabilities to sum to 1. At first I thought the problem was floating-point underflow and tried doing all the calculations in logs, but the results were similar. I understand that skipping some words hurts accuracy, but these probabilities are just far too wrong.
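For reference, here is roughly what I mean by "doing the calculations in logs": summing log-probabilities per class instead of multiplying raw probabilities. This is only a sketch against the same global tables as above (`log_classify` is a name I made up for this example, and P(W) is dropped since it is constant across classes):

from math import log

def log_classify(doc, tgt_set):
    # Work in log space: the product of many small probabilities becomes
    # a sum of logs, which avoids floating-point underflow.
    log_probs = {}
    for tgt in tgt_set:
        score = log(class_stats[tgt] / float(items_cnt))  # log P(c)
        for word in doc:
            if word not in words_stats:
                continue  # same "dirty" skip of unseen words as in classify()
            tgt_wrd_cnt = words_stats[word].get(tgt, 0)
            if tgt_wrd_cnt == 0:
                score = float('-inf')  # a zero count zeroes the whole product
                break
            score += log(float(tgt_wrd_cnt) / targets_stats[tgt])  # log P(w|c)
        log_probs[tgt] = score
    return max(log_probs, key=log_probs.get)

The ranking of the classes comes out the same as in classify(), so the accuracy does not change; only the underflow behaviour does.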
What am I doing wrong, or what am I failing to understand?
For your convenience, I have uploaded the dataset and the Python script here: https://dl.dropboxusercontent.com/u/36180992/adult.tar.gz
Thank you for your help.
Answer:
A Naive Bayes classifier does not compute probabilities directly; rather, it computes a "raw score" for each label that is compared against the other labels' scores in order to classify an instance. This score can easily be converted into a "probability" in the range [0, 1]:
total = sum(probs.itervalues())
for label, score in probs.iteritems():
    probs[label] = score / total
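Applied to the scores from the question above, this normalization yields values that do sum to 1 (numbers rounded):

probs = {'<=50K': 0.07371606889800396, '>50K': 15.325378327213354}
total = sum(probs.itervalues())
for label, score in probs.iteritems():
    probs[label] = score / total
# probs is now roughly {'<=50K': 0.0048, '>50K': 0.9952}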
However, keep in mind that this still does not represent a true probability, as mentioned in this answer:
The probabilities that Naive Bayes tends to predict are almost always either very close to zero or very close to one.
This is a consequence of the naive independence assumption: multiplying many per-feature likelihoods compounds small per-feature differences between the classes into extreme ratios.