我有以下训练集:
Text,y MRR 93345,1 MRR 93434,1 MRR 93554,1 MRR 938900,1 MRR 93970,1 MRR 937899,1 MRR 93868,1 MRR 938769,1 MRR 93930,1 MRR 92325,1 MRR 931932,1 MRR 933922,1 MRR 934390,1 MRR 93204,1 MRR 93023,1 MRR 930982,1 MRR 87678,-1 MRR 87956,-1 MRR 87890,-1 MRR 878770,-1 MRR 877886,-1 MRR 87678367,-1 MRR 8790,-1 MRR 87345,-1 MRR 87149,-1 MRR 873790,-1 MRR 873493,-1 MRR 874303,-1 MRR 874343,-1 MRR 874304,-1 MRR 879034,-1 MRR 879430,-1 MRR 87943,-1 MRR 879434,-1 MRR 871984,-1 MRR 873949,-1
我的代码如下:
# Question script. Uses RTextTools functions (create_matrix, create_container,
# train_model, classify_model); `data` is the training data frame.

# Build the document-term matrix from the training text.
dtMatrix <- create_matrix(
  data["Text"],
  language = "english",
  removePunctuation = TRUE,
  stripWhitespace = TRUE,
  toLower = TRUE,
  removeStopwords = TRUE,
  stemWords = TRUE,
  removeSparseTerms = .998
)

# Configure the training data: every row of the matrix is a training row.
# NOTE(review): the dput() of `data` names the label column `Y`, not `y` --
# confirm which spelling the real data frame uses.
container <- create_container(
  dtMatrix,
  data$y,
  trainSize = seq_len(nrow(dtMatrix)),
  virgin = FALSE
)

# Train the SVM model.
model <- train_model(container, "SVM", kernel = "linear", cost = 1)

# New data to classify.
predictionData <- list("MRR 93111")

# Build the prediction document-term matrix against the training matrix.
predMatrix <- create_matrix(
  predictionData,
  originalMatrix = dtMatrix,
  language = "english",
  removePunctuation = TRUE,
  stripWhitespace = TRUE,
  toLower = TRUE,
  removeStopwords = TRUE,
  stemWords = TRUE,
  removeSparseTerms = .998
)

# Build the matching container; the labels are placeholders (all 0).
predSize <- length(predictionData)
predictionContainer <- create_container(
  predMatrix,
  labels = rep(0, predSize),
  testSize = seq_len(predSize),
  virgin = FALSE
)

# Predict.
results <- classify_model(predictionContainer, model)
现在我想用train_model函数训练出的模型来预测:MRR 93111的y值应为1。也就是说,如果字符串以"MRR 93"开头,输出应为1;如果以"MRR 87"开头,输出应为-1。但实际上它不起作用,我得到的预测结果是:MRR 93111,标签为-1,概率为0.5778781。
此外,似乎如果我以不同的方式整理训练集…或者如果我对同一数据集多次运行脚本,结果会发生变化,这对我来说听起来很奇怪。
更新1: dput(data)
structure(list(Text = structure(c(26L, 28L, 30L, 34L, 36L, 31L, 32L, 33L, 35L, 21L, 24L, 27L, 29L, 25L, 22L, 23L, 10L, 20L, 14L, 13L, 12L, 11L, 15L, 3L, 1L, 5L, 4L, 7L, 9L, 8L, 16L, 18L, 17L, 19L, 2L, 6L), .Label = c("MRR 87149", "MRR 871984", "MRR 87345", "MRR 873493", "MRR 873790", "MRR 873949", "MRR 874303", "MRR 874304", "MRR 874343", "MRR 87678", "MRR 87678367", "MRR 877886", "MRR 878770", "MRR 87890", "MRR 8790", "MRR 879034", "MRR 87943", "MRR 879430", "MRR 879434", "MRR 87956", "MRR 92325", "MRR 93023", "MRR 930982", "MRR 931932", "MRR 93204", "MRR 93345", "MRR 933922", "MRR 93434", "MRR 934390", "MRR 93554", "MRR 937899", "MRR 93868", "MRR 938769", "MRR 938900", "MRR 93930", "MRR 93970"), class = "factor"), Y = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L)), .Names = c("Text", "Y"), class = "data.frame", row.names = c(NA, -36L))
回答:
你的问题在于,你的代码是在词(term)级别上进行训练和分类的:每个编号(例如"93345")整体被当作一个词,如下面的词项列表所示。
> dtMatrix$dimnames$Terms
 [1] "87149" "871984" "87345" "873493" "873790" "873949" "874303" "874304" "874343" "87678" "87678367"
[12] "877886" "878770" "87890" "8790" "879034" "87943" "879430" "879434" "87956" "92325" "93023"
[23] "930982" "93111" "931932" "93204" "93345" "933922" "93434" "934390" "93554" "937899" "93868"
[34] "938769" "938900" "93930" "93970" "mrr"
我不是很确定SVM是如何处理这些数字字符串的,但它似乎不太在意字符串中的93部分。将字符串拆分成字符会对各个数字赋予更大的权重:
# Split every string into single characters joined by spaces, e.g.
# "MRR 93" becomes "M R R   9 3", so that each digit becomes its own term.
# strsplit() is vectorised over df$Text; vapply() guarantees a character
# vector of the same length, including when df$Text is empty.
df$Text <- vapply(
  strsplit(df$Text, split = ""),
  paste,
  character(1),
  collapse = " "
)
我使用df代替data,因为data在RTextTools中已经是一个对象,在运行你的代码时给我带来了一些问题。在创建矩阵时,必须更改最小词长度选项。
# Build the document-term matrix from the character-split text.
dtMatrix <- create_matrix(
  df$Text,
  language = "english",
  minWordLength = 1, # must be changed so that one-character terms are kept
  removePunctuation = TRUE,
  stripWhitespace = TRUE,
  toLower = TRUE,
  removeStopwords = TRUE,
  stemWords = TRUE,
  removeSparseTerms = .998
)
现在我们得到:
> dtMatrix$dimnames$Terms
 [1] "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "m" "r"
更重要的是:
> results
  SVM_LABEL  SVM_PROB
1         1 0.9144185
我最近参加了一个关于RTextTools和SVM的研讨会,他们提到每次训练模型时,SVM会得到略有不同的结果。我不太确定为什么,所以我不会尝试解释,但我们被推荐了一本名为《An Introduction to Statistical Learning with Applications in R》的免费书籍,以了解更多关于支持向量机的信息。
这是完整的代码:
# Complete script. Uses RTextTools functions (create_matrix, create_container,
# train_model, classify_model).

# Training data (36 labelled rows), reproduced from dput().
df <- structure(list(Text = structure(c(26L, 28L, 30L, 34L, 36L, 31L,
  32L, 33L, 35L, 21L, 24L, 27L, 29L, 25L, 22L, 23L, 10L, 20L, 14L, 13L,
  12L, 11L, 15L, 3L, 1L, 5L, 4L, 7L, 9L, 8L, 16L, 18L, 17L, 19L, 2L, 6L),
  .Label = c("MRR 87149", "MRR 871984", "MRR 87345", "MRR 873493",
  "MRR 873790", "MRR 873949", "MRR 874303", "MRR 874304", "MRR 874343",
  "MRR 87678", "MRR 87678367", "MRR 877886", "MRR 878770", "MRR 87890",
  "MRR 8790", "MRR 879034", "MRR 87943", "MRR 879430", "MRR 879434",
  "MRR 87956", "MRR 92325", "MRR 93023", "MRR 930982", "MRR 931932",
  "MRR 93204", "MRR 93345", "MRR 933922", "MRR 93434", "MRR 934390",
  "MRR 93554", "MRR 937899", "MRR 93868", "MRR 938769", "MRR 938900",
  "MRR 93930", "MRR 93970"), class = "factor"), Y = c(1L, 1L, 1L, 1L,
  1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, -1L, -1L, -1L, -1L,
  -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L,
  -1L, -1L)), .Names = c("Text", "Y"), class = "data.frame",
  row.names = c(NA, -36L))

# Factor -> character so the strings can be split and a new row appended.
df$Text <- as.character(df$Text)

# Number of labelled rows, recorded before the unlabelled row is appended.
n_train <- nrow(df)

# New data: the row to classify, appended with an empty label.
df[nrow(df) + 1, ] <- c("MRR 93111", "")

# Split every string into single characters joined by spaces so that each
# digit becomes its own term.
df$Text <- vapply(
  strsplit(df$Text, split = ""),
  paste,
  character(1),
  collapse = " "
)

# Build the document-term matrix; minWordLength = 1 keeps one-character terms.
dtMatrix <- create_matrix(
  df$Text,
  language = "english",
  minWordLength = 1,
  removePunctuation = TRUE,
  stripWhitespace = TRUE,
  toLower = TRUE,
  removeStopwords = TRUE,
  stemWords = TRUE,
  removeSparseTerms = .998
)
dtMatrix$dimnames$Terms
dtMatrix$dimnames$Docs

# Configure the data: labelled rows train the model, the appended row is the
# test row. virgin = TRUE because the test row has no real label.
container <- create_container(
  dtMatrix,
  labels = df$Y,
  trainSize = seq_len(n_train),
  testSize = n_train + 1,
  virgin = TRUE
)

# Train the SVM model.
model <- train_model(container, "SVM", kernel = "linear", cost = 1)

# Classify the test row and print the result.
results <- classify_model(container, model)
results