我正在尝试将 1 和 0 的 16 长度字符数组拆分为 2 个等长整数,以便将 8 位二进制转换为十进制。
例子: char *str = "0001011011110000"; 预期结果:
int s = 00010110;
int t = 11110000;
完整代码: 它的作用:用户输入一串 DNA(例如:ATTCGG)。如果字符串不能被 4 整除,它将使用 strcat() 填充额外的字符。然后它将每个字符转换为新字符数组 xtr[64] 中的两位字符串。然后必须将该数组转换为两个等长的 8 位二进制整数,以转换为代表 DNA 字符串的两个十进制数。基本上 DNA 二进制压缩是作业。
// Reads a DNA string, pads it with '0' characters, expands every character
// into a two-character bit string in xtr, then attempts to turn the two
// halves of xtr into integers and prints str, xtr and both integers.
int main()
{
char str[64];
// NOTE(review): "%s" carries no field width, so 64 or more input characters overflow str.
scanf("%s", str);
int obe = strlen(str);
int mod = obe % 4;
if (mod != 0) {
// NOTE(review): appends `mod` pad characters, not `4 - mod`, so the length
// only becomes a multiple of 4 when mod == 2.
for (int i = mod; i > 0; i--) {
strcat(str, "0");
}
}
int j;
// Two characters are appended to xtr per recognised input character, so the
// 64-byte buffer overflows once str holds more than 31 of them.
char xtr[64] = "";
// Mapping: 'A' -> "00", 'T' -> "01", 'C' -> "10", 'G' -> "11", pad '0' -> "00".
// Any other character appends nothing.
for (j = 0; j < strlen(str); j++) {
if (str[j] == 'A') {
strcat(xtr, "0");
strcat(xtr, "0");
} else if (str[j] == 'T') {
strcat(xtr, "0");
strcat(xtr, "1");
} else if (str[j] == 'C') {
strcat(xtr, "1");
strcat(xtr, "0");
} else if (str[j] == 'G') {
strcat(xtr, "1");
strcat(xtr, "1");
} else if (str[j] == '0') {
strcat(xtr, "0");
strcat(xtr, "0");
}
}
int k = strlen(xtr) / 2;
char ret[64];
// Copies the first half of xtr into ret[0..k-1].
// NOTE(review): ret is never NUL-terminated, so atoi(ret) below reads
// indeterminate bytes after index k-1.
for (int i = 0; i < k; i++) {
ret[i] = xtr[i];
}
char ter[64];
// NOTE(review): the loop starts at k + 1, skipping xtr[k], and stores into
// ter[i] instead of ter[i - k]; ter[0..k] therefore stay uninitialised and
// no terminator is written.
for (int i = k + 1; i < strlen(xtr); i++) {
ter[i] = xtr[i];
}
// atoi parses decimal digits, so a bit string such as "00010110" would be
// read as the decimal number 10110, not as a binary value.
int s = atoi(ret);
int t = atoi(ter);
printf("%s", str);
printf("\n");
printf("%s", xtr);
printf("\n");
printf("%d", s);
printf("\n");
printf("%d", t);
}
结果: ATTCGG00 0001011011110000 10110 0
问题:第二个整数没有被正确转换,这段代码很原始。可能需要按位运算符。
#include <stdio.h>
/*
 * Convert a run of '0'/'1' characters into the integer they spell in binary,
 * most significant bit first.
 *
 * str      - text to read; every character other than '1' counts as a 0 bit.
 * bitCount - maximum number of characters to consume.
 *
 * Returns the parsed value. Parsing stops early at the string terminator, so
 * a string shorter than bitCount is never read out of bounds. A null str or a
 * non-positive bitCount yields 0.
 */
int parseBitChars(const char *str, int bitCount) {
    /* Unsigned accumulator: shifting a bit into the top position stays well-defined. */
    unsigned int bits = 0;

    if (!str) {
        return 0;
    }
    /* `<` rather than `!=` so a negative count cannot run off the buffer. */
    for (int i = 0; i < bitCount && str[i] != '\0'; i++) {
        bits = (bits << 1) | (str[i] == '1' ? 1u : 0u);
    }
    return (int)bits;
}
// Demonstrates parseBitChars on a fixed 16-character bit string.
int main() {
    char *bits = "0001011011110000";

    // First interpret all 16 characters as a single number.
    int whole = parseBitChars(bits, 16);
    printf("Value: %d\n", whole); // Value: 5872

    // Then read the same text as two consecutive 8-bit groups.
    int high = parseBitChars(&bits[0], 8);
    int low = parseBitChars(&bits[8], 8);
    printf("Bytes: %d %d\n", high, low); // Bytes: 22 240
}
不需要将 DNA 序列转换为中间字符串。
幸运的是,字母“A”、“C”、“G”和“T”的 ASCII 码在位 1 和位 2 中编码得很好。
'A' = 0bxxxxx00x ==> 0 // 'x' == "don't care"
'C' = 0bxxxxx01x ==> 2
'G' = 0bxxxxx11x ==> 6
'T' = 0bxxxxx10x ==> 4
缺点是:与传统的“ACGT”顺序相比,这种按 ASCII 位得到的编码把最后两个碱基(G 和 T)的顺序交换了。
这个“交换”可以通过使用精心制作的 8 位十六进制值的翻译来“取消交换”。
探索以下代码并研究下面的演示字符串:
#include <stdio.h>
/*
 * Pack a string of DNA bases into 2-bit codes, one accumulator word at a
 * time, and print each word in hex followed by the bases decoded back from it.
 *
 * p - NUL-terminated sequence of 'A', 'C', 'G', 'T'. The string is echoed
 *     first, then one output line is printed per chunk. A final partial chunk
 *     is padded on the right with 0b00, which decodes as 'A'.
 */
void demo( const char *p ) { // chunks of bases into registers
    puts( p );
    while( *p ) {
        // The accumulator type sets the chunk size (2 bits per base):
        // unsigned char = 4 bases, unsigned short = 8, unsigned int = 16.
        // Changing the type also requires a matching printf conversion below.
        unsigned int asBits = 0; // 16 bases/chunk
        const int pack = sizeof(asBits) * 4;

        // The ASCII for each of ACGT is pretty fortunate; can be hashed to two bits 0-3.
        // 0xB4: (0b10110100) 4 pairs of bits crafted to correspond to "GTCA" (reversed for shifting.)
        // Note that T&G are swapped by that 'magic byte' to conform to conventional "ACGT"
        // "AND"ing with 6 masks for the two fortunate bits,
        // "0xB4" is right shifted 0, 2, 6 or 4 bits,
        // that is then masked (3&) for its lowest two bits.
        // 'A'->0b00, 'C'->0b01, 'G'->0b10 and 'T'->0b11
        // The accumulator is shifted and this pair OR'd where they belong.

        // Declared outside the loop header: the count of unused slots is
        // still needed after the loop to pad the last chunk.
        int i;
        for( i = pack; *p && i; p++, i-- )
            asBits = asBits<<2 | (3 & (0xB4>>(*p&6))); // using one of several mapping functions

        // Sequence may not be modulo 16, so tack on extra 0b00 to pad as needed.
        // At least one base was consumed, so the shift is at most 2*(pack-1) bits.
        asBits <<= i+i; // padding for stragglers

        // Playback for verification
        printf( "%0*X - ", pack/2, asBits );
        for( int j = pack+pack-2; j >= 0; j -= 2 )
            putchar( "ACGT"[(asBits>>j)&3] );
        putchar( '\n' );
    }
}
// Runs demo() over a fixed list of sample sequences of varying length.
int main( void ) {
    /*
     * Bonus: alternative expressions that map a base character *cp to its
     * 2-bit code (A=0, C=1, G=2, T=3):
     *   "\0\1\3\2"[*cp>>1&3]
     *   "\0\0\0\1\3\0\0\2"[*cp&7]
     *   3&0x8340>>(*cp<<1&0xF)
     *   3&0xB4>>(*cp&6)
     */
    char *samples[] = {
        "TGCTTGCCTGCATGCA",  // 16 bases
        "TTGCTTGCCTGCATGCT", // 17 bases
        "T",                 // 1-4 bases
        "AT",
        "AAT",
        "AAAT",
        // lots of bases
        "CATCATCATCATCATCATCATCATCATCATCATCATCATCATCAT",
    };
    const int count = (int)(sizeof samples / sizeof samples[0]);

    for( int s = 0; s < count; s++ )
        demo( samples[s] );

    return 0;
}
输出演示:
TGCTTGCCTGCATGCA
E7E5E4E4 - TGCTTGCCTGCATGCA
TTGCTTGCCTGCATGCT
F9F97939 - TTGCTTGCCTGCATGC
C0000000 - TAAAAAAAAAAAAAAA
T
C0000000 - TAAAAAAAAAAAAAAA
AT
30000000 - ATAAAAAAAAAAAAAA
AAT
0C000000 - AATAAAAAAAAAAAAA
AAAT
03000000 - AAATAAAAAAAAAAAA
CATCATCATCATCATCATCATCATCATCATCATCATCATCATCAT
4D34D34D - CATCATCATCATCATC
34D34D34 - ATCATCATCATCATCA
D34D34C0 - TCATCATCATCATAAA
玩一会儿。