我正在做一个学校项目,使用的是下面这个 Kaggle 链接中的已清洗数据集。我需要用 C++ 编写代码对数据进行排序,并能够对数据做统计分析。
https://www.kaggle.com/datasets/elvinrustam/ebay-laptops-and-netbooks-sales/data
我已经反复试错了很多次,但始终没弄明白如何用 C++ 正确处理数据中的逗号和零散的引号。我尝试把每个表头(列)下的数据存入一个向量,以便之后对数据进行操作;但打印数据时,我发现某一列里混入了其他列的数据,还夹杂着零散的逗号、括号和大段文字等。
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <string>
#include <set>
/// Splits one CSV record into its fields.
///
/// Rules applied:
///  - Fields are separated by commas that appear outside double quotes.
///  - A field may be wrapped in double quotes; the wrapping quotes are removed.
///  - Inside a quoted field, two consecutive quotes ("") stand for one literal quote.
///  - Every other byte is copied through unchanged, including the bytes of
///    multi-byte UTF-8 characters such as a bullet.
///
/// @param line  One CSV record (it may contain embedded newlines inside quotes).
/// @return The fields in order. An empty `line` yields an empty vector; any
///         non-empty `line` yields at least one field, and a trailing empty
///         field (written as `a,` or `a,""`) is preserved.
std::vector<std::string> parseCSVLine(const std::string& line) {
    std::vector<std::string> fields;
    if (line.empty()) {
        return fields;  // an empty line carries no fields at all
    }

    std::string cell;
    bool inQuotes = false;
    for (std::size_t pos = 0; pos < line.size(); ++pos) {
        const char ch = line[pos];
        if (ch == '"') {
            const bool escapedQuote =
                inQuotes && pos + 1 < line.size() && line[pos + 1] == '"';
            if (escapedQuote) {
                cell += '"';  // "" inside quotes is a literal quote
                ++pos;        // skip the second quote of the pair
            } else {
                inQuotes = !inQuotes;  // opening or closing quote
            }
        } else if (ch == ',' && !inQuotes) {
            fields.push_back(cell);
            cell.clear();
        } else {
            cell += ch;
        }
    }

    // The text after the last separator is always a field, even when it is
    // empty (e.g. `a,` or `a,""`); otherwise rows lose their last column.
    fields.push_back(cell);
    return fields;
}
/// Loads a CSV file into one vector per column and prints the third column.
///
/// The first record is treated as the header row. Every following record
/// contributes exactly one entry to every column, so entries at the same
/// position in different columns belong to the same row.
///
/// @return 0 on success, 1 if the file cannot be opened.
int main() {
    const std::string filePath = "insert file path here";
    std::ifstream file(filePath);
    if (!file.is_open()) {
        std::cerr << "Failed to open file: " << filePath << std::endl;
        return 1;
    }

    // Reads one logical CSV record into `record`. A quoted field may contain
    // line breaks, so physical lines are joined until the number of double
    // quotes seen is even (escaped quotes "" come in pairs and keep the parity).
    // Returns false once no more input is available.
    const auto readRecord = [&file](std::string& record) {
        record.clear();
        std::string physicalLine;
        bool insideQuotes = false;
        bool readAnything = false;
        while (std::getline(file, physicalLine)) {
            // Files with CRLF line endings leave a '\r' at the end of the line.
            if (!physicalLine.empty() && physicalLine.back() == '\r') {
                physicalLine.pop_back();
            }
            if (readAnything) {
                record += '\n';  // restore the line break that getline removed
            }
            readAnything = true;
            for (const char c : physicalLine) {
                if (c == '"') {
                    insideQuotes = !insideQuotes;
                }
            }
            record += physicalLine;
            if (!insideQuotes) {
                break;
            }
        }
        return readAnything;
    };

    std::string record;
    std::vector<std::string> headers;
    std::vector<std::vector<std::string>> columnData;
    if (readRecord(record)) {
        headers = parseCSVLine(record);
        columnData.resize(headers.size());
    }
    while (readRecord(record)) {
        const auto data = parseCSVLine(record);
        if (data.empty()) {
            continue;  // blank line: not a data row
        }
        // Pad short rows with empty cells so all columns keep the same length;
        // cells beyond the header count are ignored.
        for (size_t i = 0; i < columnData.size(); ++i) {
            columnData[i].push_back(i < data.size() ? data[i] : std::string());
        }
    }
    file.close();

    //// Example output: Printing unique values for each heading for verification
    //for (size_t i = 0; i < headers.size(); ++i) {
    //    std::set<std::string> uniqueValues(columnData[i].begin(), columnData[i].end());
    //    std::cout << "Heading: " << headers[i] << " - Unique Values: " << uniqueValues.size() << std::endl;
    //    for (const auto& value : uniqueValues) {
    //        std::cout << value << std::endl;
    //    }
    //    std::cout << std::endl;
    //}

    // Vector indices are 0-based, so index 2 selects the third column (heading 3).
    const size_t index = 2;
    // columnData has one entry per header, so this check also guards headers[index].
    if (index < columnData.size()) {
        std::cout << "Values under Heading 3 (" << headers[index] << "):" << std::endl;
        for (const auto& value : columnData[index]) {
            std::cout << value << '\n';
        }
        std::cout.flush();
    }
    else {
        std::cerr << "Index out of range. The columnData does not have a heading 3." << std::endl;
    }
    return 0;
}
为了简化解答,这里先做一些假设。
快速浏览提问者(OP)的输入数据,这些假设似乎都成立。但如果数据发生变化,您需要重新验证。
注意:对于一般的 CSV 文件,这些条件未必成立,此时您应该使用真正的 CSV 解析器。但对于满足这些条件、格式规范的 CSV 文件,下面的做法应该就足够了。
我会把这个问题拆分成两个子问题。
所以我会这样做:
注意:代码未经审查,因此可能存在一些错误。请通过测试验证。
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>
struct Value
{
std::string value;
friend std::ostream& operator<<(std::ostream& s, Value const& data)
{
s << data.value;
return s;
}
friend std::istream& operator>>(std::istream& s, Value& data)
{
char peek = s.peek();
if (peek != '"') {
std::getline(s, data.value, ','); // drops trailing comma
}
else {
char get1 = s.get(); // gets leading quote
std::getline(s, data.value, '"'); // drops tailing quote
char get2 = s.get(); // gets trailing comma
}
return s;
}
};
/// One CSV record: the ordered list of its fields.
struct Line
{
    std::vector<Value> values;

    /// Writes every value followed by a comma (so the output ends with a comma).
    friend std::ostream& operator<<(std::ostream& s, Line const& data)
    {
        for (auto const& v: data.values) {
            s << v << ",";
        }
        return s;
    }

    /// Reads one logical record from `s` and splits it into values.
    ///
    /// A record normally is one text line, but a quoted field may contain line
    /// breaks; physical lines are therefore joined until the number of double
    /// quotes read is even. `data` is modified only when a record was read.
    friend std::istream& operator>>(std::istream& s, Line& data)
    {
        // Reads one physical line, removing the '\r' left by CRLF line endings.
        const auto readPhysicalLine = [&s](std::string& text) {
            if (!std::getline(s, text)) {
                return false;
            }
            if (!text.empty() && text.back() == '\r') {
                text.pop_back();
            }
            return true;
        };
        // True when `text` contains an odd number of double quotes.
        const auto flipsQuoteState = [](const std::string& text) {
            bool odd = false;
            for (const char c : text) {
                if (c == '"') {
                    odd = !odd;
                }
            }
            return odd;
        };

        std::string record;
        if (!readPhysicalLine(record)) {
            return s;                           // no more input; `data` untouched
        }

        bool insideQuotes = flipsQuoteState(record);
        std::string continuation;
        while (insideQuotes && readPhysicalLine(continuation)) {
            record += '\n';                     // restore the break getline removed
            record += continuation;
            if (flipsQuoteState(continuation)) {
                insideQuotes = false;
            }
        }
        if (insideQuotes) {
            // Input ended inside a quoted field. Still deliver the partial
            // record; the next extraction will report the end of input.
            s.clear(s.rdstate() & ~std::ios_base::failbit);
        }

        // Extraction of Value stops at end of input, so a record ending in a
        // separator (`a,b,`) would otherwise lose its final, empty field.
        const bool trailingEmptyField =
            !insideQuotes && !record.empty() && record.back() == ',';

        std::stringstream lineStream(std::move(record));
        std::vector<Value> tmp{std::istream_iterator<Value>{lineStream}, std::istream_iterator<Value>{}};
        if (trailingEmptyField) {
            tmp.emplace_back();
        }
        data.values = std::move(tmp);
        return s;
    }
};
/// Reads the CSV file record by record and prints how many records there are,
/// followed by a histogram of "number of values per record".
///
/// @return 0 on success, 1 if the file cannot be opened.
int main()
{
    std::ifstream csv("EbayPcLaptopsAndNetbooksUnclean.csv");
    if (!csv) {
        std::cerr << "Failed to open file: EbayPcLaptopsAndNetbooksUnclean.csv\n";
        return 1;
    }

    Line line;
    int count = 0;
    // lineSizeCount[k] == number of records that had exactly k values.
    // The table grows on demand, so a record of any width is counted safely.
    std::vector<int> lineSizeCount;
    while(csv >> line) {
        ++count;
        const std::size_t width = line.values.size();
        if (width >= lineSizeCount.size()) {
            lineSizeCount.resize(width + 1, 0);
        }
        ++lineSizeCount[width];
    }

    std::cout << "Lines: " << count << "\n";
    for (std::size_t loop = 0; loop < lineSizeCount.size(); ++loop) {
        if (lineSizeCount[loop] != 0) {
            std::cout << " Size: " << loop << " => " << lineSizeCount[loop] << "\n";
        }
    }
}