Support vector machine performs much worse in Python than in R

I am trying to roughly reproduce the performance of a cross-validated C-classification SVM from R (the e1071 package) in Python (scikit-learn), but Python's predictive performance is nowhere near R's. On the training and test data below (smoothed from a much longer dataset), R achieves a prediction skill of 0.87 (1 is perfect), while Python achieves 0.55, which is barely better than random guessing. Note that I am not trying to get identical results; I simply expect that if R performs reasonably well on a dataset, Python should be able to as well. I split the data 50-50 into training and test sets, and I am predicting a binary outcome from floats. The R and Python code are given below. Every default SVM parameter I checked is the same between R and Python (gamma, C (cost), shrinking, tol, etc.).
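(For reference, one quick way to list scikit-learn's side of that comparison is SVC().get_params(); a minimal check, not part of the original code, might look like this:)

from sklearn.svm import SVC

# List SVC's default hyperparameters to compare against e1071's svm() defaults
print(SVC().get_params())  # includes C, gamma, kernel, shrinking, tol, ...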

R code:

library("e1071")data <- c(-108.604150711185, -131.880188127745, -18.3017441809734, 32.011639982337, -71.6651360870381, -107.587087751331, 21.316311739316, -36.015324564807, 138.22302265079, 47.9322592065447, -129.007749732555, -150.41808326425, -141.00589707504, -105.912063885407, 76.2956568174239, 141.457541434218, -20.6676395937811, -226.505644333494, -151.229861588686, -160.18717733968, -107.01667849677, -7.52794131287047, -93.1147621027003, 5.59630172385392, 38.741091785708, -32.9061390503546, -78.5031246062325, -9.64080356337477, -54.1430873201472, -108.127067430103, -12.2589074567133, 129.212940940854, 132.670728015743, 107.075153550768, 167.176831103164, -20.6839530330714, 102.677911281291, -109.423698849103, -154.454318421757, 140.52342226202, 110.184351332211, -16.6842057565239, -11.1688984829787, 178.441845032635, 37.0689292040101, 166.610506783818, -79.2764182099804, 99.1136693164655, 82.0929274697289, 15.1752041486536, 178.489001782771, 145.332200036106, -185.977800430997, -90.5440753976243, 78.0459300120412, 144.297553387967, 99.5945824957091, 110.803195137024, 81.3094331750562, -396.825240330405, -166.038928089807, -78.863983688682, 138.309908804212, -148.647304302406, -2.23135233624276, 129.411511929621, -111.664324254549, -96.4151180340831, 129.219227225386, 90.7050615157428, 141.986869866474, 93.0147970463941, 142.807435791073, -75.8426755946232, 122.537973092667, 117.078515092191, 134.166968023265, 90.8512172789568, 146.367129646428, 125.539182526718, -70.485058023267, -46.967575223949, 116.210349687502, -91.2992704167832, 104.052231138142, -114.580693287221, -82.9991067628608, -111.649187979413)class <- as.factor(c(0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0))df_training <- data.frame(data, class)data <- c(133.75999742845, 22.9386702890105, -126.959902277009, -116.317935595297, -33.9418594804197, -49.0102540773413, -159.266630498512, -8.92296705690401, 114.328300224712, 66.0706175847251, -154.385344188283, 70.7868284982941, -28.334490887314, 118.755307949047, 154.362286178401, 101.331675190569, 96.2196681290104, 99.5694296232446, 210.160787371823, 65.8474210711036, -125.475676456606, 66.7541385125748, -161.001356357477, -40.1416817172267, 38.6877489907967, -7.12706914419719, -10.3967176519225, -80.6831091111636, 128.604227270616, 75.4219966516171, 184.951786958864, 90.9170782990185, 66.7190886024699, 81.377280661573, -82.4053965286415, -65.6718687269108, 61.1679518726262, 190.532649096311, 199.917670153196, 104.558442558929, 113.747065157369, 106.640501329133, 80.593201532054, 75.0176280888154, 155.538654396817, 30.0548798029353, 116.900219512636, 131.431417509576, 33.3308447581156, -121.191534016935, -80.4203785670198, 157.737407847885, 66.5956228628815, 50.8340706561446, -113.713450848071, -18.7787225270887, 113.832326071127, -45.5884280143408, 221.782395098832, 70.1660982367319, 235.005982636939, 80.8180320055801, -74.7107276814795, 133.925782624001, 97.9261686360971, -127.954532027281, 58.9295075974962, 96.1702797891484, -49.6048543914143, -42.1842037639683, -235.694708213157, 13.4862841916787, 126.396462591781, 214.297316240176, 125.148658464391, 84.8887673204376, 78.2717096234718, 139.677936314095, -168.649300541479, 103.40253638232, 69.2727189156141, 153.017155534869, -238.07168745534, -166.929968475244, 113.414489211719, 85.5520123243496, 
120.582346886614, -214.850084749638, 96.8090523924549)class <- as.factor(c(1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1))df_test <- data.frame(data, class)#train modelbest.svm <- best.tune(svm,                      class~data,                      data=df_training,kernel = 'radial',cost = 1, gamma = 0.01,                      type = "C-classification")#make predictionsTrainingPredictions<-predict(best.svm,df_training,type="class")TestPredictions <- predict(best.svm,df_test,type="class")Skill = sum(TestPredictions==df_test[[c('class')]])/length(TestPredictions)print(Skill) #value is 0.87

Python code:

import numpy as np
from sklearn.svm import SVC

Data = np.array([-108.604150711185, -131.880188127745, -18.3017441809734, 32.011639982337, -71.6651360870381, -107.587087751331, 21.316311739316, -36.015324564807, 138.22302265079, 47.9322592065447, -129.007749732555, -150.41808326425, -141.00589707504, -105.912063885407, 76.2956568174239, 141.457541434218, -20.6676395937811, -226.505644333494, -151.229861588686, -160.18717733968, -107.01667849677, -7.52794131287047, -93.1147621027003, 5.59630172385392, 38.741091785708, -32.9061390503546, -78.5031246062325, -9.64080356337477, -54.1430873201472, -108.127067430103, -12.2589074567133, 129.212940940854, 132.670728015743, 107.075153550768, 167.176831103164, -20.6839530330714, 102.677911281291, -109.423698849103, -154.454318421757, 140.52342226202, 110.184351332211, -16.6842057565239, -11.1688984829787, 178.441845032635, 37.0689292040101, 166.610506783818, -79.2764182099804, 99.1136693164655, 82.0929274697289, 15.1752041486536, 178.489001782771, 145.332200036106, -185.977800430997, -90.5440753976243, 78.0459300120412, 144.297553387967, 99.5945824957091, 110.803195137024, 81.3094331750562, -396.825240330405, -166.038928089807, -78.863983688682, 138.309908804212, -148.647304302406, -2.23135233624276, 129.411511929621, -111.664324254549, -96.4151180340831, 129.219227225386, 90.7050615157428, 141.986869866474, 93.0147970463941, 142.807435791073, -75.8426755946232, 122.537973092667, 117.078515092191, 134.166968023265, 90.8512172789568, 146.367129646428, 125.539182526718, -70.485058023267, -46.967575223949, 116.210349687502, -91.2992704167832, 104.052231138142, -114.580693287221, -82.9991067628608, -111.649187979413])
Class = np.array([0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0])
df_training = np.array([Data, Class])

Data = np.array([133.75999742845, 22.9386702890105, -126.959902277009, -116.317935595297, -33.9418594804197, -49.0102540773413, -159.266630498512, -8.92296705690401, 114.328300224712, 66.0706175847251, -154.385344188283, 70.7868284982941, -28.334490887314, 118.755307949047, 154.362286178401, 101.331675190569, 96.2196681290104, 99.5694296232446, 210.160787371823, 65.8474210711036, -125.475676456606, 66.7541385125748, -161.001356357477, -40.1416817172267, 38.6877489907967, -7.12706914419719, -10.3967176519225, -80.6831091111636, 128.604227270616, 75.4219966516171, 184.951786958864, 90.9170782990185, 66.7190886024699, 81.377280661573, -82.4053965286415, -65.6718687269108, 61.1679518726262, 190.532649096311, 199.917670153196, 104.558442558929, 113.747065157369, 106.640501329133, 80.593201532054, 75.0176280888154, 155.538654396817, 30.0548798029353, 116.900219512636, 131.431417509576, 33.3308447581156, -121.191534016935, -80.4203785670198, 157.737407847885, 66.5956228628815, 50.8340706561446, -113.713450848071, -18.7787225270887, 113.832326071127, -45.5884280143408, 221.782395098832, 70.1660982367319, 235.005982636939, 80.8180320055801, -74.7107276814795, 133.925782624001, 97.9261686360971, -127.954532027281, 58.9295075974962, 96.1702797891484, -49.6048543914143, -42.1842037639683, -235.694708213157, 13.4862841916787, 126.396462591781, 214.297316240176, 125.148658464391, 84.8887673204376, 78.2717096234718, 139.677936314095, -168.649300541479, 103.40253638232, 69.2727189156141, 153.017155534869, -238.07168745534, -166.929968475244, 113.414489211719, 85.5520123243496, 120.582346886614, -214.850084749638, 96.8090523924549])
Class = np.array([1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1])
df_test = np.array([Data, Class])

# train model
clf = SVC(verbose=True, gamma=0.01, kernel='rbf', C=1)

# make predictions
clf.fit(df_training[0].reshape(88, 1), df_training[1])  # X must be 2-D; y stays 1-D
TrainingPredictions = clf.predict(df_training[0].reshape(88, 1))
TestPredictions = clf.predict(df_test[0].reshape(89, 1))
Skill = np.sum(TestPredictions == df_test[1]) / float(len(TestPredictions))
print(Skill)  # value is 0.55

Answer:

The difference you observe most likely comes from the fact that in R, the svm() function scales the data by default (see the documentation, page 6).
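Intuitively, scaling matters here because the RBF kernel is K(x, x') = exp(-gamma * ||x - x'||^2): with raw values in the hundreds and gamma = 0.01, the exponent is on the order of minus several hundred, so nearly all off-diagonal kernel entries collapse to 0 and the SVM degenerates to near-random guessing. A minimal sketch, using two of the training values above, illustrates this:

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

x = np.array([[-108.604150711185], [138.22302265079]])  # two raw training points
print(rbf_kernel(x, gamma=0.01))         # off-diagonal entries ~ exp(-609), effectively 0

x_scaled = (x - x.mean()) / x.std()      # standardize, as R's svm() does by default
print(rbf_kernel(x_scaled, gamma=0.01))  # off-diagonal entries ~ 0.96, informative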

If you use scikit-learn's StandardScaler, you end up with a result very close to the one you got in R:

import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

Data = np.array([-108.604150711185, -131.880188127745, -18.3017441809734, 32.011639982337, -71.6651360870381, -107.587087751331, 21.316311739316, -36.015324564807, 138.22302265079, 47.9322592065447, -129.007749732555, -150.41808326425, -141.00589707504, -105.912063885407, 76.2956568174239, 141.457541434218, -20.6676395937811, -226.505644333494, -151.229861588686, -160.18717733968, -107.01667849677, -7.52794131287047, -93.1147621027003, 5.59630172385392, 38.741091785708, -32.9061390503546, -78.5031246062325, -9.64080356337477, -54.1430873201472, -108.127067430103, -12.2589074567133, 129.212940940854, 132.670728015743, 107.075153550768, 167.176831103164, -20.6839530330714, 102.677911281291, -109.423698849103, -154.454318421757, 140.52342226202, 110.184351332211, -16.6842057565239, -11.1688984829787, 178.441845032635, 37.0689292040101, 166.610506783818, -79.2764182099804, 99.1136693164655, 82.0929274697289, 15.1752041486536, 178.489001782771, 145.332200036106, -185.977800430997, -90.5440753976243, 78.0459300120412, 144.297553387967, 99.5945824957091, 110.803195137024, 81.3094331750562, -396.825240330405, -166.038928089807, -78.863983688682, 138.309908804212, -148.647304302406, -2.23135233624276, 129.411511929621, -111.664324254549, -96.4151180340831, 129.219227225386, 90.7050615157428, 141.986869866474, 93.0147970463941, 142.807435791073, -75.8426755946232, 122.537973092667, 117.078515092191, 134.166968023265, 90.8512172789568, 146.367129646428, 125.539182526718, -70.485058023267, -46.967575223949, 116.210349687502, -91.2992704167832, 104.052231138142, -114.580693287221, -82.9991067628608, -111.649187979413])
Data = scaler.fit_transform(Data.reshape(-1, 1)).ravel()  # StandardScaler expects a 2-D array
Class = np.array([0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0])
df_training = np.array([Data, Class])

Data = np.array([133.75999742845, 22.9386702890105, -126.959902277009, -116.317935595297, -33.9418594804197, -49.0102540773413, -159.266630498512, -8.92296705690401, 114.328300224712, 66.0706175847251, -154.385344188283, 70.7868284982941, -28.334490887314, 118.755307949047, 154.362286178401, 101.331675190569, 96.2196681290104, 99.5694296232446, 210.160787371823, 65.8474210711036, -125.475676456606, 66.7541385125748, -161.001356357477, -40.1416817172267, 38.6877489907967, -7.12706914419719, -10.3967176519225, -80.6831091111636, 128.604227270616, 75.4219966516171, 184.951786958864, 90.9170782990185, 66.7190886024699, 81.377280661573, -82.4053965286415, -65.6718687269108, 61.1679518726262, 190.532649096311, 199.917670153196, 104.558442558929, 113.747065157369, 106.640501329133, 80.593201532054, 75.0176280888154, 155.538654396817, 30.0548798029353, 116.900219512636, 131.431417509576, 33.3308447581156, -121.191534016935, -80.4203785670198, 157.737407847885, 66.5956228628815, 50.8340706561446, -113.713450848071, -18.7787225270887, 113.832326071127, -45.5884280143408, 221.782395098832, 70.1660982367319, 235.005982636939, 80.8180320055801, -74.7107276814795, 133.925782624001, 97.9261686360971, -127.954532027281, 58.9295075974962, 96.1702797891484, -49.6048543914143, -42.1842037639683, -235.694708213157, 13.4862841916787, 126.396462591781, 214.297316240176, 125.148658464391, 84.8887673204376, 78.2717096234718, 139.677936314095, -168.649300541479, 103.40253638232, 69.2727189156141, 153.017155534869, -238.07168745534, -166.929968475244, 113.414489211719, 85.5520123243496, 120.582346886614, -214.850084749638, 96.8090523924549])
Data = scaler.fit_transform(Data.reshape(-1, 1)).ravel()  # note: refits the scaler on the test set, as in the original answer; a Pipeline (below) would reuse the training-set statistics
Class = np.array([1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1])
df_test = np.array([Data, Class])

# train model
clf = SVC(verbose=True, gamma=0.01, kernel='rbf', C=1)

# make predictions
clf.fit(df_training[0].reshape(88, 1), df_training[1])
TrainingPredictions = clf.predict(df_training[0].reshape(88, 1))
TestPredictions = clf.predict(df_test[0].reshape(89, 1))
Skill = np.sum(TestPredictions == df_test[1]) / float(len(TestPredictions))
print("Skill: " + str(Skill))  # value is 0.84
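As a side note, a more idiomatic way to express this in scikit-learn is to chain the scaler and the classifier in a Pipeline, which fits StandardScaler on the training data only and reuses those statistics for the test set. A minimal sketch, assuming hypothetical names train_data/train_class/test_data/test_class holding the arrays above:

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# train_data, train_class, test_data, test_class are assumed to hold the
# training and test arrays defined above (hypothetical names)
pipe = make_pipeline(StandardScaler(), SVC(kernel='rbf', gamma=0.01, C=1))
pipe.fit(train_data.reshape(-1, 1), train_class)
skill = np.mean(pipe.predict(test_data.reshape(-1, 1)) == test_class)
print("Skill:", skill)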
