我在一个包含41万行的庞大数据集上实现了朴素贝叶斯算法。现在所有记录都能正确分类,但问题是程序需要将近一个小时才能将记录写入相应的文件。有什么方法可以提高我的代码性能吗?以下是代码片段。这段代码将41万条记录写入相应的文件。谢谢你。
fp=fopen("sales_ok_fraud.txt","r"); while(fgets(line,80,fp)!=NULL) //从文件中读取每一行以计算文件大小。 { token = strtok(line,","); token = strtok(NULL,","); token = strtok(NULL,","); token = strtok(NULL,","); token = strtok(NULL,","); token = strtok(NULL,","); token1 = strtok(token,"\n"); memcpy(mystr,&token1[0],strlen(token1)-1); mystr[strlen(token1)-1] = '\0'; if( strcmp(mystr,"ok") == 0 ) counter_ok++; else counter_fraud++; } printf("The no. of records with OK label are %f\n",counter_ok); printf("The no. of records with FRAUD label are %f\n",counter_fraud); prblty_ok = counter_ok/(counter_ok+counter_fraud); prblty_fraud = counter_fraud/(counter_ok+counter_fraud); printf("The probability of OK records is %f\n",prblty_ok); printf("The probability of FRAUD records is %f\n",prblty_fraud); fclose(fp); fp=fopen("sales_unknwn.txt","r"); fp2=fopen("sales_unknown_ok_classified.txt","a"); fp3=fopen("sales_unknown_fraud_classified.txt","a"); while(fgets(line1,80,fp)!=NULL) //从文件中读取每一行以计算文件大小。 { unknwn_attr1 = strtok(line1,","); unknwn_attr2 = strtok(NULL,","); unknwn_attr3 = strtok(NULL,","); unknwn_attr4 = strtok(NULL,","); unknwn_attr5 = strtok(NULL,","); //printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5); fp1=fopen("sales_ok_fraud.txt","r"); while(fgets(line,80,fp1)!=NULL) //从文件中读取每一行以计算文件大小。 { ok_fraud_attr1 = strtok(line,","); ok_fraud_attr2 = strtok(NULL,","); ok_fraud_attr3 = strtok(NULL,","); ok_fraud_attr4 = strtok(NULL,","); ok_fraud_attr5 = strtok(NULL,","); ok_fraud_attr6 = strtok(NULL,","); memcpy(ok_fraud_attr6_str,&ok_fraud_attr6[0],strlen(ok_fraud_attr6)-2); ok_fraud_attr6_str[strlen(ok_fraud_attr6)-2] = '\0'; //ok_fraud_attr6[strlen(ok_fraud_attr6)-2] = '\0'; //printf("Testing ok_fraud_attr6 - %s-%d\n",ok_fraud_attr6_str,strlen(ok_fraud_attr6_str)); if( strcmp(ok_fraud_attr6_str,"ok") == 0 ) { if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 ) counter_ok_attr2++; if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 ) counter_ok_attr3++; if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 ) counter_ok_attr4++; if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 ) counter_ok_attr5++; } if( strcmp(ok_fraud_attr6_str,"fraud") == 0 ) { if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 ) counter_fraud_attr2++; if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 ) counter_fraud_attr3++; if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 ) counter_fraud_attr4++; if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 ) counter_fraud_attr5++; } } fclose(fp1); if(counter_ok_attr2 == 0) prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value); else prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok); if(counter_ok_attr3 == 0) prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value); else prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok); if(counter_ok_attr4 == 0) prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value); else prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok); if(counter_ok_attr5 == 0) prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value); else prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok); if(counter_fraud_attr2 == 0) prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value); else prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud); if(counter_fraud_attr3 == 0) prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value); else prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud); if(counter_fraud_attr4 == 0) prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value); else prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud); if(counter_fraud_attr5 == 0) prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value); else prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud); total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok; total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;// printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);// printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);// printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);// printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);// printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud); if(total_prblty_ok > total_prblty_fraud) { fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5); } else { fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5); } counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0; counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0; } fclose(fp); fclose(fp2); fclose(fp3);
回答:
我可以立即看到几件可以做的事情,按照我会尝试的顺序排列:
- 停止对输出文件反复进行打开-写入-关闭、打开-写入-关闭的操作。它们的名称是固定的且有限的。在开始时一次性适当打开所有文件,然后在完成后刷新并关闭它们。
- 有几个逻辑结构可以大幅简化。
- 你的
strlen()
泛滥需要大幅减少。大多数像样的优化编译器会检测到源数据未变,并优化掉对已知未变的字符指针的后续调用,所以我会把这项放在最后(但老实说,我还是会做,因为在相同数据上反复调用strlen()
是一种不好的做法)。 - 在与OP对话后添加:你反复重新解析同一个数据文件(sales_ok_fraud.txt),每次处理sales_unknwn.txt中的一行数据。如果sales_ok_fraud.txt可以装入内存,那么12GB/每行长度就是大量不必要的重复解析。加载该数据一次,计算其基本统计数据一次,并在剩余的数据处理中使用这些数据和统计信息。
逻辑简化
你可以在一个特定的地方大幅减少工作量,将以下代码:
if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0) counter_ok_attr2++; if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0) counter_ok_attr3++; if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0) counter_ok_attr4++; if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0) counter_ok_attr5++; if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0) counter_fraud_attr2++; if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0) counter_fraud_attr3++; if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0) counter_fraud_attr4++; if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0) counter_fraud_attr5++;
改为以下代码:
if (strcmp(ok_fraud_attr6_str, "ok") == 0) { if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0) counter_ok_attr2++; if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 ) counter_ok_attr3++; if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0) counter_ok_attr4++; if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0) counter_ok_attr5++; } else if (strcmp(ok_fraud_attr6_str,"fraud") == 0) { if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0) counter_fraud_attr2++; if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0) counter_fraud_attr3++; if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0) counter_fraud_attr4++; if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0) counter_fraud_attr5++; }
前置加载sales_ok_fraud.txt
以下方法依赖于你的sales_ok_fraud.txt
统计文件的数据格式的完整性,同时尽可能严格地验证该格式。它分配足够的内存来容纳整个文件加上一个字符,以便将整个内容视为一个以null结尾的字符串。然后通过与你之前类似的通用算法将该缓冲区分成片段。结果将是一个指向固定长度字符指针数组的指针表,然后可以在你当前(并反复)打开、解析、使用和丢弃所有这些内容的地方迭代使用它。
// 声明一个包含六个字符串指针的数组typedef char *OFAttribs[6];// 加载一个由以下格式组成的表://// str1,str2,str3,str4,str5,str6\n// str1,str2,str3,str4,str5,str6\n// ...// str1,str2,str3,str4,str5,str6//// 任何偏离上述格式的情况都将导致循环提前终止,// 但会返回在失败点之前能够解析的所有内容。// 因此调用者应始终`free()`结果表和数据指针。size_t init_ok_fraud_data(const char *fname, OFAttribs **ppTable, char **ppTableData){ if (!fname || !*fname) return 0; // 检查文件是否能成功打开 FILE *fp = fopen(fname, "rb"); if (!fp) return 0; // 分配足够的内存来容纳整个文件,加上一个终止符 fseek(fp, 0, SEEK_END); long len = ftell(fp); fseek(fp, 0, SEEK_SET); // 为整个文件加上终止符分配足够的内存 OFAttribs *pTable = NULL; size_t nTableLen = 0; char *pTableData = malloc((len+1) * sizeof(char)); if (NULL != pTableData) { fread(pTableData , len, 1, fp); pTableData[len] = 0; } // 不再需要文件 fclose(fp); // 初始化第一个标记 char *token = strtok(pTableData, ","); while (token) { // 读取下一行标记 OFAttribs attribs = { NULL }; for (int i=0;i<4 && token; ++i) { attribs[i] = token; token = strtok(NULL, ","); } // 填充0..3,设置最后一个标记并继续 if (attribs[3] && token) { // 设置倒数第二个条目 attribs[4] = token; // 行条目仅由换行符终止 token = strtok(NULL, "\n"); if (token) { // 正确的格式。6个参数,5个逗号,一个换行符。 attribs[5] = token; size_t slen = strlen(token); if (slen > 0) { while (isspace(token[--slen])) token[slen] = 0; } // 在主列表上为另一个条目分配空间 OFAttribs *tmp = realloc(pTable, sizeof(*tmp) * (nTableLen+1)); if (NULL != tmp) { pTable = tmp; memcpy(pTable + nTableLen++, attribs, sizeof(attribs)); } else { // 分配失败。 printf("Error allocating memory for expanding OKFraud data set"); exit(EXIT_FAILURE); } } else { // 不正确。 printf("Invalid line format detected. Expected ok/fraud\\n"); break; } // 下一个新行的标记 token = strtok(NULL, ","); } } // 设置输出变量 *ppTable = pTable; *ppTableData = pTableData; return nTableLen;}
整合
将上述所有内容整合到你的代码库中,效果如下:
// 一次性加载ok_fraud表。OFAttribs *okfr = NULL;char *okfr_data = NULL;size_t okfr_len = init_ok_fraud_data("sales_ok_fraud.txt", &okfr, &okfr_data);// 遍历表以确定ok和fraud状态的概率。// 注意:这真的应该作为加载器的一部分来完成。for (size_t i=0;i<okfr_len; ++i){ if (0 == strcmp("ok", okfr[i][5])) ++counter_ok; else ++counter_fraud;}printf("The no. of records with OK label are %f\n",counter_ok);printf("The no. of records with FRAUD label are %f\n",counter_fraud);// 计算ok和fraud状态的概率prblty_ok = (float)counter_ok/(float)(okfr_len);prblty_fraud = (float)counter_fraud/(float)(okfr_len);printf("The probability of OK records is %f\n",prblty_ok);printf("The probability of FRAUD records is %f\n",prblty_fraud);fp=fopen("sales_unknwn.txt","r");fp2=fopen("sales_unknown_ok_classified.txt","w");fp3=fopen("sales_unknown_fraud_classified.txt","w");while(fgets(line1,sizeof(line1),fp)!=NULL) //从文件中读取每一行以计算文件大小。{ char *unknwn_attr1 = strtok(line1,","); char *unknwn_attr2 = strtok(NULL,","); char *unknwn_attr3 = strtok(NULL,","); char *unknwn_attr4 = strtok(NULL,","); char *unknwn_attr5 = strtok(NULL,","); //printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5); for (size_t i=0;i<okfr_len; ++i) { if( strcmp(okfr[i][5], "ok") == 0 ) { // ok情况 if( strcmp(unknwn_attr2, okfr[i][1]) == 0 ) counter_ok_attr2++; if( strcmp(unknwn_attr3, okfr[i][2]) == 0 ) counter_ok_attr3++; if( strcmp(unknwn_attr4, okfr[i][3]) == 0 ) counter_ok_attr4++; if( strcmp(unknwn_attr5, okfr[i][4]) == 0 ) counter_ok_attr5++; } else // fraud情况 { if( strcmp(unknwn_attr2, okfr[i][1]) == 0 ) counter_fraud_attr2++; if( strcmp(unknwn_attr3, okfr[i][2]) == 0 ) counter_fraud_attr3++; if( strcmp(unknwn_attr4, okfr[i][3]) == 0 ) counter_fraud_attr4++; if( strcmp(unknwn_attr5, okfr[i][4]) == 0 ) counter_fraud_attr5++; } } if(counter_ok_attr2 == 0) prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value); else prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok); if(counter_ok_attr3 == 0) prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value); else prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok); if(counter_ok_attr4 == 0) prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value); else prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok); if (counter_ok_attr5 == 0) prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value); else prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok); if(counter_fraud_attr2 == 0) prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value); else prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud); if(counter_fraud_attr3 == 0) prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value); else prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud); if(counter_fraud_attr4 == 0) prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value); else prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud); if(counter_fraud_attr5 == 0) prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value); else prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud); total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok; total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud; // printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5); // printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5); // printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok); // printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud); // printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud); if(total_prblty_ok > total_prblty_fraud) { fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5); } else { fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5); } counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0; counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;}// 释放表数据和动态指针数组free(okfr);free(okfr_data);fclose(fp);fclose(fp2);fclose(fp3);return 0;
这些只是几个想法。毫无疑问,还有更多可以改进的地方,但这些应该能极大地帮助你以单向扫描和持续输出的方式处理文件,这在这种情况下是尽可能高效的。毫无疑问,结合三大改进:单次文件打开和关闭、逻辑简化以及对sales_ok_fraud.txt文件进行单次解析和缓存,将会极大地提高性能,尤其是前两者和最后一个改进。
编辑 协助OP更新了此处理器,以前置加载sales_ok_fraud.txt文件内容,从而消除了反复加载、解析并立即丢弃15000多行的文本以进行重复解析(每条主源输入行一次)。相应地更新了上面的答案。