实现音节化算法但实际上很慢

问题描述 投票:0回答:2

我按照改进的Lansky算法实现了一个简单的音节化算法,但是当我需要在超过200万个单词的语料库上运行它时,它真的很慢。有人能给我指个方向,说明是什么导致它如此缓慢吗?算法如下:

  1. 最后一个元音(元音组)之后的所有内容都属于最后一个音节
  2. 第一个元音(元音组)之前的所有内容都属于第一个音节
  3. 如果元音之间的辅音数是偶数(2n),则将它们分成前半部分属于左元音,第二部分属于右元音(n / n)。
  4. 如果元音之间的辅音数是奇数(2n + 1),我们将它们分成n / n + 1个部分。
  5. 如果元音之间只有一个辅音,则它属于左元音。 #include <stdio.h> #include <string.h> #define VOWELS "aeiou" int get_n_consonant_between(char *word, int length) { int count = 0; int i = 0; while (i++ < length) { if (strchr(VOWELS, *word)) break; word++; count++; } return count; } void syllabification(char *word, int n_vowel_groups) { int i = 0, length = strlen(word), consonants; int syllables = 0, vowel_group = 0, syl_length = 0; char *syllable = word; char hola[length]; memset(hola, 0, length); if (n_vowel_groups < 2) { printf("CAN'T BE SPLIT INTO SYLLABLES\n\n"); return; } while (i < length) { if (strchr(VOWELS, word[i])) { syl_length++; i++; if (vowel_group) continue; vowel_group = 1; } else { if (vowel_group) { consonants = get_n_consonant_between(word + i, length - i); if (consonants == 1) { // printf("only one consonant\n"); syl_length++; strncpy(hola, syllable, syl_length); i++; } else { int count = consonants / 2; if ((consonants % 2) == 0) { /* number of consonants is 2n, first half belongs to the left vowel */ syl_length += count; } else { syl_length += count; } strncpy(hola, syllable, syl_length); i += count; } syllables++; if (syllables == n_vowel_groups) { printf("syllable done %d: %s\n", syllables, syllable); break; } printf("syllable %d: %s\n", syllables, hola); syllable = word + i; syl_length = 0; memset(hola, 0, length); } else { syl_length++; i++; } vowel_group = 0; } } } int count_vowel_groups(char *word) { int i, nvowels = 0; int vowel_group = 0; for (i = 0; i < strlen(word); i++) { if (strchr(VOWELS, word[i])) { if (vowel_group) continue; vowel_group = 1; } else { if (vowel_group) nvowels++; vowel_group = 0; } } // printf("%d vowel groups\n", nvowels); return nvowels; } void repl() { char *line = NULL; size_t len = 0; int i = 0; int count; FILE *file = fopen("../syllables.txt", "r"); while(i++ < 15) { getline(&line, &len, file); printf("\n\n%s\n", line); count = count_vowel_groups(line); syllabification(line, count); } } int main(int argc, char *argv[]) { // 
printf("Syllabification test:\n"); repl(); } `
c algorithm performance nlp
2个回答
1
投票

代码量很大,仅仅检查实现是否正确就不容易,主要是因为我不了解该算法的术语(比如元音组究竟指什么)。我查了一下,Google给出了很多关于不同语言音节化的研究论文(我只能看到摘要),所以我不确定代码是否正确。

但我有一些建议可能会使您的代码更快:

  1. 将你所有的strlen(word)移出for循环条件。将长度保存在变量中并改为使用该变量。也就是把 for (i = 0; i < strlen(word); i++) 改成 size_t len = strlen(word); for(i = 0; i < len; i++)
  2. 不要使用strchr检查字符是否为元音。我会使用查找表: // as global variable char vowels[256]; int main(void) { vowels['a'] = 1; vowels['e'] = 1; vowels['i'] = 1; vowels['o'] = 1; vowels['u'] = 1; ... } 当你想检查一个字符是否是一个元音时: // 0x20 | c makes c a lower case character if(vowels[0x20 | word[i]]) { syl_length++; i++; if (vowel_group) continue; vowel_group = 1; }

第一个建议可能只会带来很小的性能提升,因为编译器非常聪明,很可能本来就会做这种优化。第二个建议可能会带来更大的性能提升,因为它只是一次查表。在最坏的情况下,strchr将不得不多次遍历整个"aeiou"数组

我还建议您分析您的代码。见thisthis


footnotes

1我做了一个非常粗略的程序,比较了建议的运行时间。我添加了一些额外的代码,希望编译器不会积极地优化函数。

#include <stdio.h>
#include <string.h>
#include <time.h>


/*
 * Benchmark variant 1: add every character of a fixed sentence to t,
 * calling strlen() in the loop condition on every iteration.
 *
 * t       starting value of the accumulator
 * returns the accumulator after all characters were added, converted to int
 */
int test1(time_t t)
{
    char text[] = "The lazy dog is very lazy";
    size_t pos = 0;

    /* strlen() stays in the loop condition: this variant is the one being
       compared against test2, which measures the length only once. */
    while (pos < strlen(text))
    {
        t += text[pos];
        ++pos;
    }

    return t;
}

/*
 * Benchmark variant 2: add every character of a fixed sentence to t,
 * measuring the string length once before the loop.
 *
 * t       starting value of the accumulator
 * returns the accumulator after all characters were added, converted to int
 */
int test2(time_t t)
{
    char text[] = "The lazy dog is very lazy";
    const size_t count = strlen(text); /* computed once, reused by the loop */
    size_t pos = 0;

    while (pos < count)
    {
        t += text[pos];
        ++pos;
    }

    return t;
}

/* Lowercase vowels; test3 searches this string with strchr(). */
#define VOWELS "aeiou"
/* Lookup table indexed by character code, read by test4. main() sets the
   entries for 'a', 'e', 'i', 'o' and 'u' to 1; as a file-scope object every
   other entry starts out as 0. */
char vowels[256];

/*
 * Benchmark variant 3: add every character of a fixed sentence to t, and add
 * it a second time when strchr() finds the character in VOWELS.
 *
 * t       starting value of the accumulator
 * returns the accumulator after the pass, converted to int
 */
int test3(time_t t)
{
    char text[] = "The lazy dog is very lazy";
    const size_t count = strlen(text);

    for (size_t pos = 0; pos != count; ++pos)
    {
        const char ch = text[pos];

        /* Every character counts once ... */
        t += ch;
        /* ... and characters found in VOWELS count once more. */
        if (strchr(VOWELS, ch) != NULL)
            t += ch;
    }

    return t;
}

/*
 * Benchmark variant 4: add every character of a fixed sentence to t, and add
 * it a second time when the global vowels[] lookup table marks it.
 *
 * t       starting value of the accumulator
 * returns the accumulator after the pass, converted to int
 */
int test4(time_t t)
{
    char text[] = "The lazy dog is very lazy";
    const size_t count = strlen(text);

    for (size_t pos = 0; pos != count; ++pos)
    {
        const char ch = text[pos];

        /* Every character counts once ... */
        t += ch;
        /* ... and once more when the table entry is set. OR-ing with 0x20
           maps ASCII uppercase letters onto their lowercase index. */
        if (vowels[0x20 | ch])
            t += ch;
    }

    return t;
}

/*
 * Benchmark driver: runs test1..test4 back to back a fixed number of times,
 * accumulates the clock() ticks spent in each, and prints the totals in
 * seconds. test1 is compared with test2 and test3 with test4; the faster one
 * of each pair is labelled "wins".
 */
int main(void)
{
    /* Mark the five lowercase vowels in the lookup table read by test4. */
    const char lower_vowels[] = "aeiou";
    for (size_t k = 0; k < sizeof lower_vowels - 1; ++k)
        vowels[(unsigned char)lower_vowels[k]] = 1;

    const long iterations = 50000000;

    /* Sum of all return values; printed at the end so the calls have a
       visible effect. */
    long sink = 0;

    clock_t ticks1 = 0, ticks2 = 0, ticks3 = 0, ticks4 = 0;

    for (long round = 0; round < iterations; ++round)
    {
        /* All four variants of one round start from the same value. */
        const time_t seed = time(NULL);
        clock_t began;

        began = clock();
        sink += test1(seed);
        ticks1 += clock() - began;

        began = clock();
        sink += test2(seed);
        ticks2 += clock() - began;

        began = clock();
        sink += test3(seed);
        ticks3 += clock() - began;

        began = clock();
        sink += test4(seed);
        ticks4 += clock() - began;
    }

    printf("t1: %lf %s\n", ((double) ticks1) / CLOCKS_PER_SEC, ticks1 < ticks2 ? "wins":"loses");
    printf("t2: %lf %s\n", ((double) ticks2) / CLOCKS_PER_SEC, ticks2 < ticks1 ? "wins":"loses");
    printf("t3: %lf %s\n", ((double) ticks3) / CLOCKS_PER_SEC, ticks3 < ticks4 ? "wins":"loses");
    printf("t4: %lf %s\n", ((double) ticks4) / CLOCKS_PER_SEC, ticks4 < ticks3 ? "wins":"loses");
    printf("tmp: %ld\n", sink);

    return 0;
}

结果是:

$ gcc b.c -ob -Wall -O0
$ ./b 
t1: 10.866770 loses
t2: 7.588057 wins
t3: 10.801546 loses
t4: 8.366050 wins

$ gcc b.c -ob -Wall -O1
$ ./b
t1: 7.409297 loses
t2: 7.082418 wins
t3: 11.415080 loses
t4: 7.847086 wins

$ gcc b.c -ob -Wall -O2
$ ./b
t1: 6.292438 loses
t2: 5.855348 wins
t3: 9.306874 loses
t4: 6.584076 wins

$ gcc b.c -ob -Wall -O3
$ ./b
t1: 6.317390 loses
t2: 5.922087 wins
t3: 9.436450 loses
t4: 6.722685 wins

1
投票

你可以做以下几件事:

1)对程序进行性能分析(profile),查看它把大部分时间花在了哪里。

2)专注于代码中重复执行次数最多的部分。

3)避免多次扫描

4)不要进行不必要的操作。例:

a)

你真的需要每次都对hola调用memset吗?

 memset(hola, 0, length);

在我看来,你可以摆脱它。

b)

for (i = 0; i < strlen(word); i++) {

无需在循环内计算strlen(word)。你可以在外面做:

 int len = strlen(word);
 for (i = 0; i < len; i++) {

性能分析可以给您非常好的提示,请根据这些提示聚焦于瓶颈所在。

© www.soinside.com 2019 - 2024. All rights reserved.