I'm trying to implement the k-fold cross-validation algorithm in Python. I know scikit-learn provides an implementation, but still... This is my code as of right now.
from sklearn import metrics
import numpy as np

class Cross_Validation:

    @staticmethod
    def partition(vector, fold, k):
        size = vector.shape[0]
        start = (size/k)*fold
        end = (size/k)*(fold+1)
        validation = vector[start:end]
        if str(type(vector)) == "<class 'scipy.sparse.csr.csr_matrix'>":
            indices = range(start, end)
            mask = np.ones(vector.shape[0], dtype=bool)
            mask[indices] = False
            training = vector[mask]
        elif str(type(vector)) == "<type 'numpy.ndarray'>":
            training = np.concatenate((vector[:start], vector[end:]))
        return training, validation

    @staticmethod
    def Cross_Validation(learner, k, examples, labels):
        train_folds_score = []
        validation_folds_score = []
        for fold in range(0, k):
            training_set, validation_set = Cross_Validation.partition(examples, fold, k)
            training_labels, validation_labels = Cross_Validation.partition(labels, fold, k)
            learner.fit(training_set, training_labels)
            training_predicted = learner.predict(training_set)
            validation_predicted = learner.predict(validation_set)
            train_folds_score.append(metrics.accuracy_score(training_labels, training_predicted))
            validation_folds_score.append(metrics.accuracy_score(validation_labels, validation_predicted))
        return train_folds_score, validation_folds_score
The learner parameter is a classifier from the scikit-learn library, k is the number of folds, and examples is a sparse matrix produced by CountVectorizer (also scikit-learn) representing a bag-of-words model. For example:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from Cross_Validation import Cross_Validation as cv

vectorizer = CountVectorizer(stop_words='english', lowercase=True, min_df=2, analyzer="word")
data = vectorizer.fit_transform("""textual data""")
clfMNB = MultinomialNB(alpha=.0001)
score = cv.Cross_Validation(clfMNB, 10, data, labels)
print "Train score" + str(score[0])
print "Test score" + str(score[1])
I think there is a logic error somewhere, because the scores on the training set are 95% (as expected) but the scores on the test set are practically 0, and I can't find it.
I hope I've explained everything clearly. Thanks in advance.
________________________________EDIT___________________________________
This is the code that loads the text into a vector that can be passed to the vectorizer. It also returns the label vector.
from nltk.tokenize import word_tokenize
from Categories_Data import categories
import numpy as np
import codecs
import glob
import os
import re

class Data_Preprocessor:

    def tokenize(self, text):
        tokens = word_tokenize(text)
        alpha = [t for t in tokens if unicode(t).isalpha()]
        return alpha

    def header_not_fully_removed(self, text):
        if ":" in text.splitlines()[0]:
            return len(text.splitlines()[0].split(":")[0].split()) == 1
        else:
            return False

    def strip_newsgroup_header(self, text):
        _before, _blankline, after = text.partition('\n\n')
        if len(after) > 0 and self.header_not_fully_removed(after):
            after = self.strip_newsgroup_header(after)
        return after

    def strip_newsgroup_quoting(self, text):
        _QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:'
                               r'|^In article|^Quoted from|^\||^>)')
        good_lines = [line for line in text.split('\n')
                      if not _QUOTE_RE.search(line)]
        return '\n'.join(good_lines)

    def strip_newsgroup_footer(self, text):
        lines = text.strip().split('\n')
        for line_num in range(len(lines) - 1, -1, -1):
            line = lines[line_num]
            if line.strip().strip('-') == '':
                break
        if line_num > 0:
            return '\n'.join(lines[:line_num])
        else:
            return text

    def raw_to_vector(self, path, to_be_stripped=["header", "footer", "quoting"], noise_threshold=-1):
        base_dir = os.getcwd()
        train_data = []
        label_data = []
        for category in categories:
            os.chdir(base_dir)
            os.chdir(path + "/" + category[0])
            for filename in glob.glob("*"):
                with codecs.open(filename, 'r', encoding='utf-8', errors='replace') as target:
                    data = target.read()
                    if "quoting" in to_be_stripped:
                        data = self.strip_newsgroup_quoting(data)
                    if "header" in to_be_stripped:
                        data = self.strip_newsgroup_header(data)
                    if "footer" in to_be_stripped:
                        data = self.strip_newsgroup_footer(data)
                    if len(data) > noise_threshold:
                        train_data.append(data)
                        label_data.append(category[1])
        os.chdir(base_dir)
        return np.array(train_data), np.array(label_data)
And this is what "from Categories_Data import categories" imports...
categories = [
    ('alt.atheism', 0),
    ('comp.graphics', 1),
    ('comp.os.ms-windows.misc', 2),
    ('comp.sys.ibm.pc.hardware', 3),
    ('comp.sys.mac.hardware', 4),
    ('comp.windows.x', 5),
    ('misc.forsale', 6),
    ('rec.autos', 7),
    ('rec.motorcycles', 8),
    ('rec.sport.baseball', 9),
    ('rec.sport.hockey', 10),
    ('sci.crypt', 11),
    ('sci.electronics', 12),
    ('sci.med', 13),
    ('sci.space', 14),
    ('soc.religion.christian', 15),
    ('talk.politics.guns', 16),
    ('talk.politics.mideast', 17),
    ('talk.politics.misc', 18),
    ('talk.religion.misc', 19)
]
Answer:
The reason your validation scores are low is subtle.

The issue is how you partition the dataset. Remember: when doing cross-validation, you should split the dataset randomly. That randomness is what you are missing.

Your data is loaded category by category, which means the examples and their class labels follow one another in order, class after class, in your input dataset. Since you don't split randomly, each fold removes entire classes from the training set, so your model never sees those classes during the training phase and therefore gets very bad results in the test/validation phase.
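Here is a minimal sketch of the effect (synthetic labels with hypothetical sizes: 20 classes of 100 examples each, k = 10). When the labels arrive sorted by class, as raw_to_vector produces them, each contiguous validation slice covers only a couple of classes:

import numpy as np

# Labels loaded category by category are effectively sorted by class.
labels = np.repeat(np.arange(20), 100)  # 2000 labels: 100 of class 0, then class 1, ...
k = 10
fold_size = labels.shape[0] // k
for fold in range(k):
    validation = labels[fold * fold_size:(fold + 1) * fold_size]
    print(fold, np.unique(validation))  # each validation slice holds only 2 of the 20 classes

Those classes are entirely absent from the corresponding training fold, which is exactly why the validation accuracy collapses to 0.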
You can fix this by doing a random shuffle. So, do the following:
from sklearn.utils import shuffle

processor = Data_Preprocessor()
td, tl = processor.raw_to_vector(path="C:/Users/Pankaj/Downloads/ng/")
vectorizer = CountVectorizer(stop_words='english', lowercase=True, min_df=2, analyzer="word")
data = vectorizer.fit_transform(td)
# Shuffle the data and labels
data, tl = shuffle(data, tl, random_state=0)
clfMNB = MultinomialNB(alpha=.0001)
score = Cross_Validation.Cross_Validation(clfMNB, 10, data, tl)
print("Train score" + str(score[0]))
print("Test score" + str(score[1]))
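For what it's worth, scikit-learn's built-in cross-validation utilities do the shuffling, splitting, and scoring in one call. A sketch of the equivalent check (assuming a scikit-learn version that ships the model_selection module; StratifiedKFold additionally keeps class proportions balanced across folds, which a plain shuffle does not guarantee):

from sklearn.model_selection import StratifiedKFold, cross_val_score

# Shuffled, stratified 10-fold CV on the same vectorized data and labels.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
scores = cross_val_score(clfMNB, data, tl, cv=skf)
print("Mean validation score: " + str(scores.mean()))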