在 C 中将字符串拆分为相等的整数

问题描述 投票:0回答:2

我正在尝试将 1 和 0 的 16 长度字符数组拆分为 2 个等长整数,以便将 8 位二进制转换为十进制。

例子: char *str = "0001011011110000"; 预期结果:

int s = 00010110;
int t = 11110000;

完整代码: 它的作用:用户输入一串 DNA(例如:ATTCGG)。如果字符串长度不能被 4 整除,它将使用 strcat() 在末尾填充额外的字符。然后它将每个字符转换为两位的二进制字符串,并存入新字符数组 xtr[64] 中。接着必须把该数组拆分为两个等长的 8 位二进制串,并各自转换为十进制数,用这两个十进制数代表这条 DNA 字符串。基本上,这是一份 DNA 二进制压缩的作业。

/*
 * Reads one DNA string (letters A, T, C, G) from standard input, pads it
 * with '0' characters to a multiple of four bases, expands each base into
 * two bit characters (A=00, T=01, C=10, G=11, pad '0'=00), splits the bit
 * string into two equal halves and prints the padded string, the bit
 * string, and the value of each half interpreted as a binary number.
 *
 * Returns 0 on success, 1 when no input could be read or the input contains
 * a character other than A, T, C, G or 0.
 */
int main()
{
    /* Input is capped at 60 bases: 60 is a multiple of four, so padding can
       never push the string past the cap, and each half of the bit string
       (at most 60 bits) still fits in an unsigned long long. */
    enum { MAX_BASES = 60 };
    char str[MAX_BASES + 1];
    char xtr[2 * MAX_BASES + 1];

    /* The field width keeps scanf from writing past the end of str. */
    if (scanf("%60s", str) != 1) {
        return 1;
    }

    int len = (int)strlen(str);

    /* Pad up to the NEXT multiple of four: the number of missing bases is
       (4 - len % 4) % 4, not len % 4. */
    int pad = (4 - len % 4) % 4;
    for (int i = 0; i < pad; i++) {
        str[len++] = '0';
    }
    str[len] = '\0';

    /* Expand every base into its two-character bit pair. */
    int nbits = 0;
    for (int j = 0; j < len; j++) {
        const char *pair;
        switch (str[j]) {
        case 'A':
        case '0':
            pair = "00";
            break;
        case 'T':
            pair = "01";
            break;
        case 'C':
            pair = "10";
            break;
        case 'G':
            pair = "11";
            break;
        default:
            /* Skipping the character would leave the bit string misaligned,
               so reject the input instead. */
            printf("invalid base '%c'\n", str[j]);
            return 1;
        }
        xtr[nbits++] = pair[0];
        xtr[nbits++] = pair[1];
    }
    xtr[nbits] = '\0';

    /* nbits is always even, so both halves are exactly k bits long. The
       bit characters are accumulated as base-2 digits; atoi would read
       them as decimal digits. */
    int k = nbits / 2;
    unsigned long long s = 0;
    unsigned long long t = 0;
    for (int i = 0; i < k; i++) {
        s = (s << 1) | (unsigned long long)(xtr[i] == '1');
        t = (t << 1) | (unsigned long long)(xtr[k + i] == '1');
    }

    printf("%s", str);
    printf("\n");
    printf("%s", xtr);
    printf("\n");
    printf("%llu", s);
    printf("\n");
    printf("%llu", t);

    return 0;
}

结果: ATTCGG00 0001011011110000 10110 0

问题:第二个整数没有被正确转换,这段代码很原始。可能需要按位运算符。

arrays c char
2个回答
1
投票
#include <stdio.h>

/*
 * Parses up to bitCount leading characters of str as a binary number, most
 * significant bit first. A '1' character contributes a 1 bit; any other
 * character contributes a 0 bit.
 *
 * Parsing stops early at the terminating NUL, so a string shorter than
 * bitCount is never read past its end. A NULL str or a bitCount <= 0
 * yields 0.
 *
 * The value is accumulated in an unsigned int so that shifting into the top
 * bit is well defined; results are only meaningful when the parsed value
 * fits in an int.
 */
int parseBitChars(const char *str, int bitCount) {
  unsigned int ret = 0;
  if (!str) return 0;
  for (int i = 0; i < bitCount && str[i] != '\0'; i++) {
    ret = (ret << 1) | (str[i] == '1' ? 1u : 0u);
  }
  return (int)ret;
}

/* Demonstrates parseBitChars on a 16-character bit string: first as one
   16-bit value, then as two separate 8-bit halves. */
int main() {
  enum { BYTE_BITS = 8 };
  char bits[] = "0001011011110000";

  // All sixteen characters as a single number
  int whole = parseBitChars(bits, 2 * BYTE_BITS);
  printf("Value: %d\n", whole); // Value: 5872

  // First and second group of eight characters, parsed independently
  int high = parseBitChars(&bits[0], BYTE_BITS);
  int low = parseBitChars(&bits[BYTE_BITS], BYTE_BITS);
  printf("Bytes: %d %d\n", high, low); // Bytes: 22 240
}

0
投票

不需要将 DNA 序列转换为中间字符串。

幸运的是,字母“A”、“C”、“G”和“T”的 ASCII 码在第 1 位和第 2 位上恰好各不相同,只看这两位就足以区分这四个碱基。

'A' = 0bxxxxx00x ==> 0 // 'x' == "don't care"
'C' = 0bxxxxx01x ==> 2
'G' = 0bxxxxx11x ==> 6
'T' = 0bxxxxx10x ==> 4

缺点是,与传统的“ACGT”顺序相比,这种编码把最后两个碱基(G 和 T)的顺序交换了。

这个“交换”可以借助一个精心构造的 8 位十六进制常量(0xB4)做查表转换,从而“取消交换”。

探索以下代码并研究下面的演示字符串:

#include <stdio.h>

/*
 * Prints the base string p, then packs it into unsigned int chunks at two
 * bits per base and prints each chunk as hex followed by the bases decoded
 * back from it.
 *
 * Mapping: 'A'->0b00, 'C'->0b01, 'G'->0b10, 'T'->0b11. A final chunk that is
 * not full is padded on the right with 0b00, so the playback shows the
 * padding as extra 'A's. Characters other than A/C/G/T are not rejected;
 * they are mapped through the same bit trick to one of the four codes.
 */
void demo( const char *p ) { // chunks of bases into registers
    puts( p );
    while( *p ) {
        // The number of bases per chunk follows from the width of asBits
        // (two bits per base). A wider type would also need a matching
        // printf conversion below.
        unsigned int   asBits = 0; // 16 bases/chunk with a 32-bit unsigned int
        const int pack = sizeof(asBits) * 4;

        // The ASCII for each of ACGT is pretty fortunate; can be hashed to two bits 0-3.
        // 0xB4: (0b10110100) 4 pairs of bits crafted to correspond to "GTCA" (reversed for shifting.)
        // Note that T&G are swapped by that 'magic byte' to conform to conventional "ACGT"
        // "AND"ing with 6 masks for the two fortunate bits,
        // "0xB4" is right shifted 0, 2, 6 or 4 bits,
        // that is then masked (3&) for its lowest two bits.
        // 'A'->0b00, 'C'->0b01, 'G'->0b10 and 'T'->0b11
        // The accumulator is shifted and this pair OR'd where they belong.

        // i counts the base slots still free in this chunk. It is declared
        // outside the loop because the padding shift below needs its final value.
        int i;
        for( i = pack; *p && i; p++, i-- )
            asBits = asBits<<2 | (3 & (0xB4>>(*p&6))); // using one of several mapping functions

        // Sequence may not be modulo 16, so tack on extra 0b00 to pad as needed.
        // At least one base was consumed above, so the shift is at most 2*(pack-1) bits.
        asBits <<= i+i; // padding for stragglers

        // Playback for verification
        printf( "%0*X - ", pack/2, asBits );
        for( int j = pack+pack-2; j >= 0; j -= 2 )
            putchar( "ACGT"[(asBits>>j)&3] );
        putchar( '\n' );
    }
}


/* Runs demo() over a fixed list of sample sequences: one exact chunk, one
   that spills into a second chunk, four short ones, and one long one. */
int main( void ) {
/*
    Some bonus alternative translation functions
    char *cp;
#   define M1 "\0\1\3\2"[*cp>>1&3]
#   define M2 "\0\0\0\1\3\0\0\2"[*cp&7]
#   define M3 3&0x8340>>(*cp<<1&0xF)
#   define M4 3&0xB4>>(*cp&6)
    char *n = "0123";
    for( cp = "ACGT"; *cp; cp++ ) printf( "%c %c%c%c%c\n", *cp, n[M1], n[M2], n[M3], n[M4] );
*/
    char *samples[] = {
        "TGCTTGCCTGCATGCA",  // 16 bases
        "TTGCTTGCCTGCATGCT", // 17 bases
        "T",                 // 1-4 bases
        "AT",
        "AAT",
        "AAAT",
        // lots of bases
        "CATCATCATCATCATCATCATCATCATCATCATCATCATCATCAT",
    };
    const int count = (int)(sizeof samples / sizeof samples[0]);

    for( int k = 0; k < count; k++ )
        demo( samples[k] );

    return 0;
}

输出演示:

TGCTTGCCTGCATGCA
E7E5E4E4 - TGCTTGCCTGCATGCA

TTGCTTGCCTGCATGCT
F9F97939 - TTGCTTGCCTGCATGC
C0000000 - TAAAAAAAAAAAAAAA

T
C0000000 - TAAAAAAAAAAAAAAA

AT
30000000 - ATAAAAAAAAAAAAAA

AAT
0C000000 - AATAAAAAAAAAAAAA

AAAT
03000000 - AAATAAAAAAAAAAAA

CATCATCATCATCATCATCATCATCATCATCATCATCATCATCAT
4D34D34D - CATCATCATCATCATC
34D34D34 - ATCATCATCATCATCA
D34D34C0 - TCATCATCATCATAAA

玩一会儿。

© www.soinside.com 2019 - 2024. All rights reserved.