遗传序列（三联体）

Question

我必须编写一个程序来读取字符串并输出字母 A、T、C、G 的所有遗传序列（三联体），这些字母在整个字符串中重复次数最多。如果某些三元组重复相同的次数，则输出字符串中顺序第一个的三元组。如果字符串中除 A、T、C、G 之外还有任何其他字符或字母，则程序应打印错误并结束。我的代码通常可以正常工作并且在排序方面表现良好，但有些输入却不能，我似乎无法弄清楚为什么。

对于这些它不起作用，我在这段代码中对它们进行了硬编码，这样我就可以指出它们：

if (strncmp(input, "CTA", 3) == 0) {
        printf("GCT\n");
        return 0;
    }


    for (int i = 0; input[i + 17] != '\0'; i++) {
        if (strncmp(&input[i], "AAAAACCCCCGGGGGTCTTCT", 20) == 0) {
            printf("TCT\n");
            return 0;
        }
    }

这是主要代码

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_LENGTH 1000

int isValid(char ch) {
    return (ch == 'A' || ch == 'T' || ch == 'C' || ch == 'G');
}

void markLettersUsed(int usedIndices[], int start, int end);

int isLetterUsed(int usedIndices[], int index) {
    for (int i = 0; usedIndices[i] != -1; i++) {
        if (usedIndices[i] == index) {
            return 1;
        }
    }
    return 0;
}

int main() {
    FILE *file;
    char filename[] = "genetski_kod.txt";
    char input[MAX_LENGTH];

    file = fopen(filename, "r");
    if (file == NULL) {
        printf("Pogresan format\n");
        return 0;

    }

    int ch;
    int i = 0;

    if ((ch = fgetc(file)) == EOF) {
        printf("Pogresan format\n");
        fclose(file);
        return 0;
    }

    fseek(file, 0, SEEK_SET);

    while ((ch = fgetc(file)) != EOF && i < sizeof(input) - 1) {
        if (!isValid(ch)) {
            printf("Pogresan format\n");
            fclose(file);
            return 0;
        }
        input[i++] = ch;
    }

    input[i] = '\0';

    fclose(file);

    if (strncmp(input, "CTA", 3) == 0) {
        printf("GCT\n");
        return 0;
    }

    int maxCount = 0;
    int mostFrequentTripletStartIndex = -1;

     

    for (int i = 0; input[i + 17] != '\0'; i++) {
        if (strncmp(&input[i], "AAAAACCCCCGGGGGTCTTCT", 20) == 0) {
            printf("TCT\n");
            return 0;
        }
    }

    int usedIndices[MAX_LENGTH];

    for (int i = 0; input[i + 2] != '\0'; i++) {
        int count = 1;

        memset(usedIndices, -1, sizeof(usedIndices));

        for (int j = i + 3; input[j - 2] != '\0' && input[j - 1] != '\0' && input[j] != '\0' && input[j + 1] != '\0'; j++) {
            if (input[i] == input[j - 2] && input[i + 1] == input[j - 1] && input[i + 2] == input[j]) {
                if (!isLetterUsed(usedIndices, j - 2) && !isLetterUsed(usedIndices, j - 1) && !isLetterUsed(usedIndices, j)) {
                    count++;
                    markLettersUsed(usedIndices, j - 2, j);
                }
            }
        }

        if (count > maxCount) {
            maxCount = count;
            mostFrequentTripletStartIndex = i;
        }
    }

    if (maxCount > 0) {
        printf("%c%c%c\n", input[mostFrequentTripletStartIndex],
               input[mostFrequentTripletStartIndex + 1],
               input[mostFrequentTripletStartIndex + 2]);
    } else {
        printf("Pogresan format\n");
    }

    return 0;
}

void markLettersUsed(int usedIndices[], int start, int end) {
    for (int i = start; i <= end; i++) {
        usedIndices[i] = i;
    }
}

这是该程序的测试之一：


FILE* file = fopen("genetic_code.txt", "w");
fputs("AAAAACCCCGGGGGGTCTTCTAAA", file);
fclose(file);
_main();

file = fopen("genetic_code.txt", "w");
fputs("AAAAACCCCCGGGGGTCTTCT", file);
fclose(file);
_main();

/* Program Input/Output */
Expected output(s):
GGG
TCT

Or:
GGGTCT

Your program output:
*blank*

Answer 1

当我使用包含第一个基因测试字符串的测试文件测试您的程序时，我首先改进了诊断输出，以更好地定位问题发生的位置。问题发生在字符读取循环中。

    while ((ch = fgetc(file)) != EOF && i < sizeof(input) - 1)
    {
        if (!isValid(ch))
        {
            printf("Char - Pogresan format: %d\n", ch);
            fclose(file);
            return 0;
        }
        input[i++] = ch;
    }

添加有关可疑字符的信息，突出显示字符的读取遇到“换行”字符（ASCII 值“10”）。

craig@Vera:~/C_Programs/Console/DNA/bin/Release$ ./DNA 
Char - Pogresan format: 10

添加忽略“换行”字符的测试，我能够获得 DNA 模式输出。

    while ((ch = fgetc(file)) != EOF && i < sizeof(input) - 1)
    {
        if (ch == '\n')     /* Skip newline character */
            continue;
        if (!isValid(ch))
        {
            printf("Char - Pogresan format: %d\n", ch);
            fclose(file);
            return 0;
        }
        input[i++] = ch;
    }

然后，第一个测试字符串被输入到文本文件中。

AAAAACCCCGGGGGGTCTTCTAAA

craig@Vera:~/C_Programs/Console/DNA/bin/Release$ ./DNA 
GGG

然后，进入第二次测试，重新运行程序。

AAAAACCCCCGGGGGTCTTCT

craig@Vera:~/C_Programs/Console/DNA/bin/Release$ ./DNA 
TCT

主要要点是要注意，当遇到格式和行控制字符时，您的测试可能需要容错。

遗传序列（三联体）

问题描述投票：0回答：1

1个回答

最新问题

遗传序列（三联体）

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1