我的 LZ77 实现不知为何会丢失 ASCII 特殊字符,例如空格、换行符等。基于这个实现得到的输出是错误的,也就是说,问题完全出在这些字符上。
这是压缩代码:
/**
 * @brief Encodes @p input as a sequence of LZ77 tokens (offset, match length, next literal).
 *
 * For every position the longest match inside the preceding window is searched for;
 * on ties the smallest offset wins. Each token is followed by one literal character,
 * so the encoder consumes `length_of_match + 1` characters per token.
 *
 * @param input           Text to compress.
 * @param searchBuffer    Maximum distance (in characters) a match may reach back.
 * @param lookAheadBuffer Maximum length of a single match.
 * @return Tokens in input order; empty when @p input is empty.
 */
std::vector<Token> compression_lz77(const std::string &input, int searchBuffer, int lookAheadBuffer)
{
    const int inputLength = static_cast<int>(input.length());
    std::vector<Token> data;
    int position = 0;

    while (position < inputLength)
    {
        Token token{};
        token.offset = 0;
        token.length_of_match = 0;
        token.code_word = input[position];

        const int max_offset = (position < searchBuffer) ? position : searchBuffer;

        // Always keep one real character back for the literal that follows the match.
        // Without this cap a match reaching the end of the input would take its literal
        // from input[input.size()], i.e. '\0', and the decoder would append a stray NUL.
        const int longest_allowed = inputLength - position - 1;
        const int max_search_length = (lookAheadBuffer < longest_allowed) ? lookAheadBuffer : longest_allowed;

        for (int offset = 1; offset <= max_offset; ++offset)
        {
            int len = 0;
            // The comparison may run into the look-ahead area (overlapping match);
            // the decoder reproduces this because it copies character by character.
            while (len < max_search_length && input[position - offset + len] == input[position + len])
            {
                ++len;
            }
            // Strict '>' keeps the first (closest) of several equally long matches.
            if (len > token.length_of_match)
            {
                token.offset = offset;
                token.length_of_match = len;
                token.code_word = input[position + len];
            }
        }

        data.push_back(token);
        position += token.length_of_match + 1;
    }
    return data;
}
对于我的输入,我使用:
The quick brown fox jumps over the lazy dog.
ABCABCABCDABCDEFABCDEFGABCDEFGHABCDEFGHI
压缩输出:
(0,0,T)(0,0,h)(0,0,e)(0,0, )(0,0,q)(0,0,u)(0,0,i)(0,0,c)(0,0,k)(6,1,b)(0,0,r)(0,0,o)(0,0,w)(0,0,n)(6,1,f)(5,1,x)(4,1,j)(16,1,m)(0,0,p)(0,0,s)(6,1,o)(0,0,v)(26,1,r)(5,1,t)(31,3,l)(0,0,a)(0,0,z)(0,0,y)(5,1,d)(15,1,g)(0,0,.)(0,0,
)(0,0,
)(0,0,A)(0,0,B)(0,0,C)(3,6,D)(4,4,E)(0,0,F)(6,6,G)(7,7,H)(8,8,I)
正如你所看到的,输出中有一些空的三元组(即 <0,0,null>);还有一些三元组的右括号被奇怪地挤到了下一行,我猜这说明其中的字符是换行符,但这种编码结果很奇怪。
使用解压缩算法(我认为该算法已正确实现),我将字符串重新创建到输出文件:
The)quick)brown)fox)jumps)over)the)lazy)dog.))ABCABCABCDABCDEFABCDEFGABCDEFGHABCDEFGHI
有人能告诉我该如何处理这些特殊字符吗?
Token 结构体:
/// One LZ77 token: a back-reference into the already processed text plus the
/// literal character that follows it. Members are zero-initialised so that a
/// default-constructed Token never carries indeterminate values.
struct Token
{
    int offset = 0;          ///< Distance back from the current position to the match start; 0 means no match.
    int length_of_match = 0; ///< Number of characters copied from the match.
    char code_word = '\0';   ///< Literal character emitted after the copied characters.
};
解压函数:
/**
 * @brief Rebuilds the original text from a sequence of LZ77 tokens.
 *
 * For a token with a non-zero offset, `length_of_match` characters are copied from
 * `offset` characters before the current end of the output, then the literal
 * `code_word` is appended. A token with offset 0 contributes only its literal.
 *
 * @param compressedData Tokens in the order they were produced by the encoder.
 * @return The decompressed text.
 * @throws std::out_of_range if a token refers to a position outside the text
 *         produced so far (malformed or corrupted input).
 */
std::string decompression_lz77(const std::vector<Token> &compressedData)
{
    std::string decompressed;
    for (const Token &token : compressedData)
    {
        if (token.offset != 0)
        {
            // Signed arithmetic on purpose: a bogus offset yields a negative or too
            // large index, which at() rejects instead of reading out of bounds.
            const long long first = static_cast<long long>(decompressed.size()) - token.offset;
            const long long last = first + token.length_of_match;
            // Copy one character at a time so that overlapping matches
            // (length_of_match > offset) see the characters appended just before.
            for (long long index = first; index < last; ++index)
            {
                decompressed += decompressed.at(static_cast<std::string::size_type>(index));
            }
        }
        decompressed += token.code_word;
    }
    return decompressed;
}
还有solve函数,它可以被视为main函数,因为在main函数中我只解析来自命令行的函数和参数:
int solve(const CompressionParams ¶ms)
{
if (params.inputFileName.empty() || params.outputFileName.empty() || params.mode.empty() || params.inputBufferSize <= 0 || params.historyBufferSize <= 0)
{
std::cout << "Niepoprawne parametry linii polecen. Prosze podac wszystkie wymagane opcje." << std::endl;
printInstructions();
return 1;
}
std::ifstream inputFile(params.inputFileName, std::ios::binary);
if (!inputFile.is_open())
{
std::cerr << "Blad podczas otwierania pliku wejsciowego: " << params.inputFileName << std::endl;
return 1;
}
inputFile.seekg(0, std::ios::end);
std::streampos fileSize = inputFile.tellg();
inputFile.seekg(0, std::ios::beg);
std::vector<char> fileContent(fileSize);
inputFile.read(fileContent.data(), fileSize);
inputFile.close();
std::string data(fileContent.begin(), fileContent.end());
std::vector<Token> arr;
if (params.mode == "c")
{
if (!fileContent.empty())
{
arr = compression_lz77(data, params.inputBufferSize, params.historyBufferSize);
std::ofstream outputFile(params.outputFileName);
if (!outputFile.is_open())
{
std::cerr << "Wystapil blad podczas otwierania pliku wyjsciowego." << std::endl;
return 1;
}
for (const auto &token : arr)
{
outputFile << "<" << token.offset << "," << token.length_of_match << "," << token.code_word << ">";
}
outputFile.close();
auto startingSize = static_cast<double>(fileContent.size());
double compressedSize = static_cast<double>(arr.size()) * (sizeof(Token) / sizeof(char));
double wspolczynnik = (startingSize / compressedSize) * 100.0;
std::cout << "Teoretyczny wspolczynnik kompresji: " << wspolczynnik << "%" << std::endl;
std::cout << "Teoretyczny stopien kompresji " << (compressedSize / startingSize) << std::endl;
}
}
else if (params.mode == "d")
{
std::ofstream outputFile(params.outputFileName, std::ios::binary);
if (!outputFile.is_open())
{
std::cerr << "Wystapil blad podczas otwierania pliku wejsciowego." << std::endl;
return 1;
}
std::string input(fileContent.begin(), fileContent.end());
std::vector<Token> compressed_data;
size_t pos = 0;
while (pos < input.size())
{
size_t nextPos = input.find('>', pos);
if (nextPos == std::string::npos)
{
break;
}
std::string tokenStr = input.substr(pos, nextPos - pos + 1);
Token t{};
std::istringstream tokenStream(tokenStr);
char dummy;
tokenStream >> dummy >> t.offset >> dummy >> t.length_of_match >> dummy >> t.code_word;
compressed_data.push_back(t);
pos = nextPos + 1;
}
for (const Token &token : compressed_data)
{
std::cout << '<' << token.offset << ',' << token.length_of_match << ',' << token.code_word << '>';
}
std::cout << std::endl;
std::string decompressed_output = decompression_lz77(compressed_data);
outputFile.write(decompressed_output.c_str(), decompressed_output.size());
outputFile.close();
}
else
{
std::cerr << "Niepoprawny tryb. Proszę uzyc 'c' dla kompresji lub 'd' dla dekompresji." << std::endl;
printInstructions();
return 1;
}
return 0;
}
所有需要的头文件都已包含,例如 <string>、<vector> 等。
命令行参数解析后的值为:searchBuffer = 256,lookAheadBuffer = 4096。
问题出在这一行:
tokenStream >> dummy >> t.offset >> dummy >> t.length_of_match >> dummy >> t.code_word;
根据 cppreference 对 operator>>(std::basic_istream) 的说明:它是一个格式化输入函数,会跳过空白字符。因此,它不会把这些空白字符提取到 t.code_word 中,而是会取得其后的下一个非空白字符('>')。换句话说,>> 不会把空格或换行符(即所有被视为空白的字符)提取到 char 中。
也许,既然您已经把 offset 和 length_of_match 的值以数字的文本形式(而不是原始字节值)写入“压缩”输出文件,您也可以把 code_word 的值同样写成数字,而不是字符?
例如,替换这一行
outputFile << "<" << token.offset << "," << token.length_of_match << "," << token.code_word << ">";
改为:
outputFile << "<" << token.offset << "," << token.length_of_match << "," << static_cast<int>(token.code_word) << ">";
然后替换这一行
tokenStream >> dummy >> t.offset >> dummy >> t.length_of_match >> dummy >> t.code_word;
改为:
// Read the code word as its numeric character code (the writer must emit it as an
// integer): formatted extraction into an int is not affected by the whitespace
// skipping that makes extraction into a char drop spaces and newlines.
int temp = 0;
tokenStream >> dummy >> t.offset >> dummy >> t.length_of_match >> dummy >> temp;
t.code_word = static_cast<char>(temp);
另外,替换这一行
std::cout << '<' << token.offset << ',' << token.length_of_match << ',' << token.code_word << '>';
改为:
std::cout << '<' << token.offset << ',' << token.length_of_match << ',' << static_cast<int>(token.code_word) << '>';
当然,这样“压缩”文件中保存的将是字符的数值编码而不是原始字符,但解压应该可以成功。