我正在尝试解决一个Twitter情感分析问题。我使用的代码如下:
"""Twitter sentiment analysis: bag-of-words -> PCA -> KNN with grid search."""
# BUG FIX: a __future__ import must be the first statement in the module;
# the original placed print() calls before it, which is a SyntaxError on Py2.
from __future__ import print_function

print()
print("正在导入")
print()

import re

import numpy as np
import pandas as pd

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier


def getting_data(train_dataset_name, test_dataset_name):
    """Load the train/test CSV files and return (train_x, train_y, test).

    Assumes the script is executed in the directory that contains the
    datasets, and that in the training file column 1 is the label and
    column 2 the tweet text; in the test file column 1 is the tweet text.
    """
    print()
    print("正在获取数据")
    print()
    train = pd.read_csv(train_dataset_name).values
    train_y = train[:, 1]
    train_x = train[:, 2]
    test = pd.read_csv(test_dataset_name).values
    test = test[:, 1]
    # Keep the (n, 1) shape the rest of the pipeline expects.
    test = np.reshape(test, (test.shape[0], 1))
    return train_x, train_y, test


def _clean_texts(texts, stemmer, stop_words):
    """Lowercase, strip non-letters, drop stop words and stem each text."""
    corpus = []
    for text in texts:
        review = re.sub('[^a-zA-Z]', ' ', text)
        words = review.lower().split()
        words = [stemmer.stem(w) for w in words if w not in stop_words]
        corpus.append(' '.join(words))
    return corpus


def bagOfWords(test, train_x):
    """Return cleaned (corpus_train, corpus_test) text lists for vectorizing."""
    print()
    print("正在创建词袋模型")
    print()
    # PERF FIX: the original rebuilt set(stopwords.words('english')) for
    # every single word and a fresh PorterStemmer per tweet; build both once.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    corpus_train = _clean_texts(train_x, stemmer, stop_words)
    # test is an (n, 1) array, so take the single string in each row.
    corpus_test = _clean_texts((row[0] for row in test), stemmer, stop_words)
    return corpus_train, corpus_test


def dimensionality_reduction(corpus_train, corpus_test, return_ratio, components):
    """Vectorize the corpora (bag of words) and project them with PCA.

    Returns (train_x, test) — plus the explained-variance ratios when
    return_ratio is truthy.
    """
    print()
    print("正在进行降维")
    print()
    cv = CountVectorizer(max_features=1500)
    train_x = cv.fit_transform(corpus_train).toarray()
    pca = PCA(n_components=components)
    train_x = pca.fit_transform(train_x)
    explained_variance = pca.explained_variance_ratio_
    # The test set must only be transformed with the fitted vectorizer/PCA.
    test = cv.transform(corpus_test).toarray()
    test = pca.transform(test)
    test = test.astype('float32')
    if return_ratio:
        return train_x, test, explained_variance
    return train_x, test


def getOptimumParameters(train_x, train_y, return_stats):
    """Grid-search a KNeighborsClassifier and return the fitted search object.

    When return_stats is truthy, also returns best_score_ and best_params_.
    NOTE(review): algorithm='brute' together with n_jobs=-1 can exhaust
    memory on large inputs (each worker holds a full distance matrix) —
    shrink the grid or n_jobs if you hit MemoryError.
    """
    print()
    print("正在获取最佳参数")
    print("此优化算法可能需要一些时间,请耐心等待。")
    print("请在运行期间不要进行其他任务。")
    print()
    train_x = train_x.astype('float32')
    train_y = train_y.astype('float32')
    # BUG FIX: the original fitted the classifier here and then handed it to
    # GridSearchCV(refit=True), which refits anyway — the pre-fit was wasted.
    classifier = KNeighborsClassifier()
    # These grids are specific to the author's experiment; adjust as needed.
    neighbor_list = [1, 3, 6, 9, 12, 15, 18, 21, 25]
    algorithm_list = ['brute', 'kd_tree', 'ball_tree']
    weights_list = ['uniform', 'distance']
    p_list = [1]  # p_list = [1, 2, 3, 4]
    leaf_list = [10, 15, 20, 25, 30, 35, 40, 45, 50]
    parameters = [{
        'n_neighbors': neighbor_list,
        'weights': weights_list,
        'algorithm': algorithm_list,
        'p': p_list,
        'leaf_size': leaf_list,
    }]
    clf = GridSearchCV(estimator=classifier, param_grid=parameters, cv=5,
                       refit=True, error_score=0, n_jobs=-1)
    clf = clf.fit(train_x, train_y)
    if return_stats:
        return clf, clf.best_score_, clf.best_params_
    return clf


def predictions(classifier, train_x, train_y, test, ratio):
    """Predict labels for the test set with the already-fitted classifier.

    train_x/train_y/ratio are kept for interface compatibility; the original
    split them into train/dev sets but never used the result (dead code,
    removed).
    """
    print()
    print("正在进行预测")
    print()
    test = test.astype('float32')
    return classifier.predict(test)


def convertPredToCsv(pred, csv_name):
    """Write predictions to csv_name with an 'id' index and 'label' column."""
    df = pd.DataFrame(pred)
    df.index.name = 'id'
    df.columns = ['label']
    # BUG FIX: the original ignored csv_name and hard-coded the file name.
    df.to_csv(csv_name)


def main():
    # Load the data.
    train_x, train_y, test = getting_data('train.csv', 'test_tweets.csv')
    # Build the cleaned corpora.
    corpus_train, corpus_test = bagOfWords(test, train_x)
    # Vectorize and reduce dimensionality.
    train_x, test = dimensionality_reduction(corpus_train, corpus_test, False, 350)
    # Find the best classifier.
    classifier = getOptimumParameters(train_x, train_y, False)
    # Predict and write the CSV.
    pred = predictions(classifier, train_x, train_y, test, 0.1)
    convertPredToCsv(pred, 'predictions.csv')


if __name__ == "__main__":
    main()
每次运行到getOptimumParameters函数时,我都会遇到许多错误。有些错误提示是属性错误,但大多数错误我找不到错误名称。我认为大多数其他错误都是为了引导我找到属性错误。我无法弄清楚为什么会发生这个错误。我知道我的GridSearch有问题,但我不知道是参数有问题(我已经三次检查过了,没有发现任何问题),还是有其他问题。任何帮助都将不胜感激。谢谢。
D:\Anaconda\lib\site-packages\numpy\core\fromnumeric.py in _wrapfunc(obj=array([[ 0. , 30.70562651, 27.84020028, .... 38.11465899, 25.22553572, 0. ]]), method='argpartition', *args=(0,), **kwds={'axis': 1, 'kind': 'introselect', 'order': None}) 47 return result 48 49 50 def _wrapfunc(obj, method, *args, **kwds): 51 try:---> 52 return getattr(obj, method)(*args, **kwds) obj = array([[ 0. , 30.70562651, 27.84020028, .... 38.11465899, 25.22553572, 0. ]]) method = 'argpartition' args = (0,) kwds = {'axis': 1, 'kind': 'introselect', 'order': None} 53 54 # 如果对象的类中没有这样的方法,会发生属性错误 55 # 内存错误:
数据来自我的analyticsvidhya问题。这里是训练数据的下载链接 – 这是一个Dropbox链接。https://www.dropbox.com/s/w4tagiewcuoxgkt/train.csv?dl=0
这是测试数据的链接:https://www.dropbox.com/s/qiitwlpnkbs2c3m/test_tweets.csv?dl=0
谢谢。
回答:
我知道已经过了一段时间,很抱歉。
只是想让大家知道:在我当时使用的旧版 scikit-learn 中,对于长时间的网格搜索(至少在 Windows 上),导入
sklearn.model_selection.GridSearchCV
几乎总是会抛出内存错误,而改用旧模块
sklearn.grid_search.GridSearchCV
则能正常完成。注意:sklearn.grid_search 在 scikit-learn 0.18 中已被弃用、0.20 中已被移除,现代版本只能使用 sklearn.model_selection。如果在新版本中仍遇到 MemoryError,更可能的原因是 n_jobs=-1 让每个并行进程各自持有完整的距离矩阵(尤其是 algorithm='brute' 时)——可尝试减小 n_jobs、缩小参数网格,或从搜索空间中去掉 'brute'。