如何在C语言中操作文件时提高性能

我在一个包含41万行的庞大数据集上实现了朴素贝叶斯算法。现在所有记录都能正确分类,但问题是程序需要将近一个小时才能将记录写入相应的文件。有什么方法可以提高我的代码性能吗?以下是代码片段。这段代码将41万条记录写入相应的文件。谢谢你。

    fp=fopen("sales_ok_fraud.txt","r");        while(fgets(line,80,fp)!=NULL) //从文件中读取每一行以计算文件大小。        {                token = strtok(line,",");                token = strtok(NULL,",");                token = strtok(NULL,",");                token = strtok(NULL,",");                token = strtok(NULL,",");                token = strtok(NULL,",");                token1 = strtok(token,"\n");                memcpy(mystr,&token1[0],strlen(token1)-1);                mystr[strlen(token1)-1] = '\0';        if( strcmp(mystr,"ok") == 0 )            counter_ok++;        else         counter_fraud++;           }    printf("The no. of records with OK label are %f\n",counter_ok);    printf("The no. of records with FRAUD label are %f\n",counter_fraud);    prblty_ok = counter_ok/(counter_ok+counter_fraud);    prblty_fraud = counter_fraud/(counter_ok+counter_fraud);    printf("The probability of OK records is %f\n",prblty_ok);    printf("The probability of FRAUD records is %f\n",prblty_fraud);    fclose(fp);    fp=fopen("sales_unknwn.txt","r");    fp2=fopen("sales_unknown_ok_classified.txt","a");    fp3=fopen("sales_unknown_fraud_classified.txt","a");    while(fgets(line1,80,fp)!=NULL) //从文件中读取每一行以计算文件大小。        {                unknwn_attr1 = strtok(line1,",");                unknwn_attr2 = strtok(NULL,",");                unknwn_attr3 = strtok(NULL,",");                unknwn_attr4 = strtok(NULL,",");                unknwn_attr5 = strtok(NULL,",");        //printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);        fp1=fopen("sales_ok_fraud.txt","r");        while(fgets(line,80,fp1)!=NULL) //从文件中读取每一行以计算文件大小。            {            ok_fraud_attr1 = strtok(line,",");                    ok_fraud_attr2 = strtok(NULL,",");                    ok_fraud_attr3 = strtok(NULL,",");                    ok_fraud_attr4 = strtok(NULL,",");                    ok_fraud_attr5 = strtok(NULL,",");            ok_fraud_attr6 = strtok(NULL,",");                    memcpy(ok_fraud_attr6_str,&ok_fraud_attr6[0],strlen(ok_fraud_attr6)-2);                    ok_fraud_attr6_str[strlen(ok_fraud_attr6)-2] = '\0';            //ok_fraud_attr6[strlen(ok_fraud_attr6)-2] = '\0';                  //printf("Testing ok_fraud_attr6 - %s-%d\n",ok_fraud_attr6_str,strlen(ok_fraud_attr6_str));             if( strcmp(ok_fraud_attr6_str,"ok") == 0 )            {                if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )                counter_ok_attr2++;                if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )                counter_ok_attr3++;                if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )                counter_ok_attr4++;                if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )                counter_ok_attr5++;            }                    if( strcmp(ok_fraud_attr6_str,"fraud") == 0 )                        {                if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )                counter_fraud_attr2++;                if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )                counter_fraud_attr3++;                if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )                counter_fraud_attr4++;                if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )                counter_fraud_attr5++;            }        }        fclose(fp1);        if(counter_ok_attr2 == 0)        prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);        else        prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok);        if(counter_ok_attr3 == 0)        prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);        else                prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok);        if(counter_ok_attr4 == 0)        prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);        else                prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok);        if(counter_ok_attr5 == 0)        prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);        else                prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok);        if(counter_fraud_attr2 == 0)        prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);        else                prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud);        if(counter_fraud_attr3 == 0)        prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);        else                prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud);        if(counter_fraud_attr4 == 0)        prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);        else                prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud);        if(counter_fraud_attr5 == 0)        prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);        else                prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud);        total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok;        total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;//      printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);//      printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);//      printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);//      printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);//      printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud);        if(total_prblty_ok > total_prblty_fraud)        {            fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);        }           else        {            fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);        }        counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0;        counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;    }    fclose(fp);        fclose(fp2);            fclose(fp3);    

回答:

我可以立即看到几件可以做的事情,按照我会尝试的顺序排列:

  1. 停止对输出文件反复进行打开-写入-关闭、打开-写入-关闭的操作。它们的名称是固定的且有限的。在开始时一次性适当打开所有文件,然后在完成后刷新并关闭它们。
  2. 有几个逻辑结构可以大幅简化。
  3. 你的strlen()泛滥需要大幅减少。大多数像样的优化编译器会检测到源数据未变,并优化掉对已知未变的字符指针的后续调用,所以我会把这项放在最后(但老实说,我还是会做,因为在相同数据上反复调用strlen()是一种不好的做法)。
  4. 在与OP对话后添加:你反复重新解析同一个数据文件(sales_ok_fraud.txt),每次处理sales_unknwn.txt中的一行数据。如果sales_ok_fraud.txt可以装入内存,那么12GB/每行长度就是大量不必要的重复解析。加载该数据一次,计算其基本统计数据一次,并在剩余的数据处理中使用这些数据和统计信息。

逻辑简化

你可以在一个特定的地方大幅减少工作量,将以下代码:

    if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)        counter_ok_attr2++;    if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)        counter_ok_attr3++;    if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)        counter_ok_attr4++;    if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)        counter_ok_attr5++;    if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)        counter_fraud_attr2++;    if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)        counter_fraud_attr3++;    if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)        counter_fraud_attr4++;    if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)        counter_fraud_attr5++;

改为以下代码:

    if (strcmp(ok_fraud_attr6_str, "ok") == 0)    {        if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0)            counter_ok_attr2++;        if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )            counter_ok_attr3++;        if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0)            counter_ok_attr4++;        if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0)            counter_ok_attr5++;    }    else if (strcmp(ok_fraud_attr6_str,"fraud") == 0)    {        if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0)            counter_fraud_attr2++;        if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0)            counter_fraud_attr3++;        if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0)            counter_fraud_attr4++;        if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0)            counter_fraud_attr5++;    }

前置加载sales_ok_fraud.txt

以下方法依赖于你的sales_ok_fraud.txt统计文件的数据格式的完整性,同时尽可能严格地验证该格式。它分配足够的内存来容纳整个文件加上一个字符,以便将整个内容视为一个以null结尾的字符串。然后通过与你之前类似的通用算法将该缓冲区分成片段。结果将是一个指向固定长度字符指针数组的指针表,然后可以在你当前(并反复)打开、解析、使用和丢弃所有这些内容的地方迭代使用它。

// 声明一个包含六个字符串指针的数组typedef char *OFAttribs[6];// 加载一个由以下格式组成的表://// str1,str2,str3,str4,str5,str6\n// str1,str2,str3,str4,str5,str6\n// ...// str1,str2,str3,str4,str5,str6//// 任何偏离上述格式的情况都将导致循环提前终止,// 但会返回在失败点之前能够解析的所有内容。// 因此调用者应始终`free()`结果表和数据指针。size_t init_ok_fraud_data(const char *fname, OFAttribs **ppTable, char **ppTableData){    if (!fname || !*fname)        return 0;    // 检查文件是否能成功打开    FILE *fp = fopen(fname, "rb");    if (!fp)        return 0;    // 分配足够的内存来容纳整个文件,加上一个终止符    fseek(fp, 0,  SEEK_END);    long len = ftell(fp);    fseek(fp, 0,  SEEK_SET);    // 为整个文件加上终止符分配足够的内存    OFAttribs *pTable = NULL;    size_t nTableLen = 0;    char *pTableData =  malloc((len+1) * sizeof(char));    if (NULL != pTableData)    {        fread(pTableData , len, 1, fp);        pTableData[len] = 0;    }    // 不再需要文件    fclose(fp);    // 初始化第一个标记    char *token = strtok(pTableData, ",");    while (token)    {        // 读取下一行标记        OFAttribs attribs = { NULL };        for (int i=0;i<4 && token; ++i)        {            attribs[i] = token;            token = strtok(NULL, ",");        }        // 填充0..3,设置最后一个标记并继续        if (attribs[3] && token)        {            // 设置倒数第二个条目            attribs[4] = token;            // 行条目仅由换行符终止            token = strtok(NULL, "\n");            if (token)            {                // 正确的格式。6个参数,5个逗号,一个换行符。                attribs[5] = token;                size_t slen = strlen(token);                if (slen > 0)                {                    while (isspace(token[--slen]))                        token[slen] = 0;                }                // 在主列表上为另一个条目分配空间                OFAttribs *tmp = realloc(pTable, sizeof(*tmp) * (nTableLen+1));                if (NULL != tmp)                {                    pTable = tmp;                    memcpy(pTable + nTableLen++, attribs, sizeof(attribs));                }                else                {   // 分配失败。                    printf("Error allocating memory for expanding OKFraud data set");                    exit(EXIT_FAILURE);                }            }            else            {   // 不正确。                printf("Invalid line format detected. Expected ok/fraud\\n");                break;            }            // 下一个新行的标记            token = strtok(NULL, ",");        }    }    // 设置输出变量    *ppTable = pTable;    *ppTableData = pTableData;    return nTableLen;}

整合

将上述所有内容整合到你的代码库中,效果如下:

// 一次性加载ok_fraud表。OFAttribs *okfr = NULL;char *okfr_data = NULL;size_t okfr_len = init_ok_fraud_data("sales_ok_fraud.txt", &okfr, &okfr_data);// 遍历表以确定ok和fraud状态的概率。// 注意:这真的应该作为加载器的一部分来完成。for (size_t i=0;i<okfr_len; ++i){    if (0 == strcmp("ok", okfr[i][5]))        ++counter_ok;    else        ++counter_fraud;}printf("The no. of records with OK label are %f\n",counter_ok);printf("The no. of records with FRAUD label are %f\n",counter_fraud);// 计算ok和fraud状态的概率prblty_ok = (float)counter_ok/(float)(okfr_len);prblty_fraud = (float)counter_fraud/(float)(okfr_len);printf("The probability of OK records is %f\n",prblty_ok);printf("The probability of FRAUD records is %f\n",prblty_fraud);fp=fopen("sales_unknwn.txt","r");fp2=fopen("sales_unknown_ok_classified.txt","w");fp3=fopen("sales_unknown_fraud_classified.txt","w");while(fgets(line1,sizeof(line1),fp)!=NULL) //从文件中读取每一行以计算文件大小。{    char *unknwn_attr1 = strtok(line1,",");    char *unknwn_attr2 = strtok(NULL,",");    char *unknwn_attr3 = strtok(NULL,",");    char *unknwn_attr4 = strtok(NULL,",");    char *unknwn_attr5 = strtok(NULL,",");    //printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);    for (size_t i=0;i<okfr_len; ++i)    {        if( strcmp(okfr[i][5], "ok") == 0 )        {            // ok情况            if( strcmp(unknwn_attr2, okfr[i][1]) == 0 )                counter_ok_attr2++;            if( strcmp(unknwn_attr3, okfr[i][2]) == 0 )                counter_ok_attr3++;            if( strcmp(unknwn_attr4, okfr[i][3]) == 0 )                counter_ok_attr4++;            if( strcmp(unknwn_attr5, okfr[i][4]) == 0 )                counter_ok_attr5++;        }        else // fraud情况        {            if( strcmp(unknwn_attr2, okfr[i][1]) == 0 )                counter_fraud_attr2++;            if( strcmp(unknwn_attr3, okfr[i][2]) == 0 )                counter_fraud_attr3++;            if( strcmp(unknwn_attr4, okfr[i][3]) == 0 )                counter_fraud_attr4++;            if( strcmp(unknwn_attr5, okfr[i][4]) == 0 )                counter_fraud_attr5++;        }    }    if(counter_ok_attr2 == 0)        prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);    else        prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok);    if(counter_ok_attr3 == 0)        prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);    else        prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok);    if(counter_ok_attr4 == 0)        prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);    else        prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok);    if (counter_ok_attr5 == 0)        prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);    else        prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok);    if(counter_fraud_attr2 == 0)        prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);    else        prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud);    if(counter_fraud_attr3 == 0)        prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);    else        prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud);    if(counter_fraud_attr4 == 0)        prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);    else        prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud);    if(counter_fraud_attr5 == 0)        prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);    else        prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud);    total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok;    total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;    //      printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);    //      printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);    //      printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);    //      printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);    //      printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud);    if(total_prblty_ok > total_prblty_fraud)    {        fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);    }    else    {        fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);    }    counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0;    counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;}// 释放表数据和动态指针数组free(okfr);free(okfr_data);fclose(fp);fclose(fp2);fclose(fp3);return 0;

这些只是几个想法。毫无疑问,还有更多可以改进的地方,但这些应该能极大地帮助你以单向扫描和持续输出的方式处理文件,这在这种情况下是尽可能高效的。毫无疑问,结合三大改进:单次文件打开和关闭、逻辑简化以及对sales_ok_fraud.txt文件进行单次解析和缓存,将会极大地提高性能,尤其是前两者和最后一个改进。

编辑 协助OP更新了此处理器,以前置加载sales_ok_fraud.txt文件内容,从而消除了反复加载、解析并立即丢弃15000多行的文本以进行重复解析(每条主源输入行一次)。相应地更新了上面的答案。

Related Posts

L1-L2正则化的不同系数

我想对网络的权重同时应用L1和L2正则化。然而,我找不…

使用scikit-learn的无监督方法将列表分类成不同组别,有没有办法?

我有一系列实例,每个实例都有一份列表,代表它所遵循的不…

f1_score metric in lightgbm

我想使用自定义指标f1_score来训练一个lgb模型…

通过相关系数矩阵进行特征选择

我在测试不同的算法时,如逻辑回归、高斯朴素贝叶斯、随机…

可以将机器学习库用于流式输入和输出吗?

已关闭。此问题需要更加聚焦。目前不接受回答。 想要改进…

在TensorFlow中,queue.dequeue_up_to()方法的用途是什么?

我对这个方法感到非常困惑,特别是当我发现这个令人费解的…

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注