如何确定 IEnumerable 的字节大小以获得良好的批量大小?

问题描述 投票:0回答:2

我正在将记录写入 csv 格式,以便通过对上传文件大小有限制的外部 API 将其作为文件上传。我将记录写入内存的方法如下所示:

using CsvHelper;

/// <summary>
/// Serializes <paramref name="recordsToWrite"/> to CSV (sv-SE culture) entirely
/// in memory and returns the resulting bytes.
/// </summary>
/// <typeparam name="T">Record type; mapped by CsvHelper's default conventions.</typeparam>
/// <param name="recordsToWrite">Records to serialize. Enumerated exactly once.</param>
/// <returns>The raw CSV file content.</returns>
public async Task<byte[]> WriteToMemoryAsync<T>(IEnumerable<T> recordsToWrite) where T : class
{
    using (var memoryStream = new MemoryStream())
    {
        using (var writer = new StreamWriter(memoryStream))
        using (var csv = new CsvWriter(writer, new CultureInfo("sv-SE")))
        {
            await csv.WriteRecordsAsync(recordsToWrite);
            // Flush asynchronously here so the synchronous Dispose below has
            // nothing left to write (the original relied on Dispose to flush,
            // which does blocking I/O inside an async method).
            await writer.FlushAsync();
        }
        // ToArray is valid on a closed MemoryStream, so this is safe after Dispose.
        return memoryStream.ToArray();
    }
}

我目前的批处理方法是这样的:

/// <summary>
/// Splits <paramref name="recordsToWrite"/> into batches of at most 50 000 records,
/// serializes each batch to CSV via <see cref="WriteToMemoryAsync{T}"/>, and returns
/// the results keyed by zero-based batch number.
/// </summary>
/// <param name="recordsToWrite">Records to batch and serialize.</param>
/// <returns>Batch number → CSV bytes for that batch. Empty for empty input.</returns>
public async Task<Dictionary<int, byte[]>> BatchWriteToMemoryAsync<T>(IEnumerable<T> recordsToWrite) where T : class
{
    var maxBatchSize = 50_000;

    // Materialize once: the original called Count() and then re-enumerated the
    // source per batch, which breaks for one-shot sequences.
    var allRecords = recordsToWrite as IReadOnlyCollection<T> ?? recordsToWrite.ToList();
    var nrOfBatches = (int)Math.Ceiling((double)allRecords.Count / maxBatchSize);

    Dictionary<int, byte[]> records = new();
    for (int batchNr = 0; batchNr < nrOfBatches; batchNr++)
    {
        // BUG FIX: the original passed the WHOLE sequence for every batch, so
        // each "batch" contained all records. Slice out only this batch.
        var batch = allRecords.Skip(batchNr * maxBatchSize).Take(maxBatchSize);
        records.Add(batchNr, await WriteToMemoryAsync<T>(batch));
    }

    return records;
}

问题是这个批量大小(50 000 条)有点随意:如果 T 包含较大的对象,单个批次序列化后的文件仍可能超过上传限制而失败。

有什么方法可以获取

IEnumerable<T>
的字节大小,以便获得文件大小的近似值,然后据此确定批次数?

还有其他方法可以解决这个问题吗?

c# csv ienumerable csvhelper
2个回答
1
投票

如果改为将所有项目发送到

WriteToMemory
,然后检查流的大小,如果大小符合要求,则返回写入的项目并为下一批初始化新流。所以
WriteToMemory
将返回一组批次。 您可能不想在每条写入记录后刷新,找到合适的刷新间隔。

/// <summary>
/// Streams <paramref name="recordsToWrite"/> into CSV batches, yielding the
/// accumulated bytes each time the in-memory stream reaches
/// <paramref name="maxBatchSizeBytes"/>, plus one final partial batch.
/// Note: a batch is emitted AFTER the record that crossed the threshold, so
/// batches may slightly exceed the limit by up to one record's size.
/// </summary>
/// <param name="recordsToWrite">Records to serialize. Enumerated lazily.</param>
/// <param name="maxBatchSizeBytes">
/// Size threshold that closes a batch. Defaults to 1024 to preserve the
/// original behavior; callers should pass their real API upload limit.
/// </param>
public static IEnumerable<byte[]> WriteToMemory<T>(IEnumerable<T> recordsToWrite, long maxBatchSizeBytes = 1024)
{
    var memoryStream = new MemoryStream();
    var writer = new StreamWriter(memoryStream);
    var csv = new CsvWriter(writer, new CultureInfo("sv-SE"));

    try
    {
        foreach (var r in recordsToWrite)
        {
            csv.WriteRecord(r);
            csv.Flush(); // might want to flush after every x items instead of each.
            if (memoryStream.Length >= maxBatchSizeBytes)
            {
                yield return memoryStream.ToArray();

                // BUG FIX: dispose writers BEFORE the stream. CsvWriter and
                // StreamWriter flush on Dispose; the original disposed the
                // MemoryStream first, so those flushes targeted a dead stream.
                csv.Dispose();
                writer.Dispose();
                memoryStream.Dispose();
                memoryStream = new MemoryStream();
                writer = new StreamWriter(memoryStream);
                csv = new CsvWriter(writer, new CultureInfo("sv-SE"));
            }
        }

        csv.Flush();
        if (memoryStream.Length > 0)
            yield return memoryStream.ToArray();
    }
    finally
    {
        // Same order here: writers first, backing stream last.
        csv.Dispose();
        writer.Dispose();
        memoryStream.Dispose();
    }
}

-1
投票

您可以估算每条记录的大小(注意:sizeof() 只适用于非托管类型,对泛型引用类型 T 无法直接编译通过,需要改用其他估算方式):

你的函数应该是这样的:

/// <summary>
/// Batches the records so each serialized CSV file stays under ~50 MB.
/// The per-record size is estimated empirically by serializing the first
/// record, because <c>sizeof(T)</c> does not compile for a generic reference
/// type (the original code would not build).
/// </summary>
/// <param name="recordsToWrite">Records to batch and serialize.</param>
/// <returns>Batch number → CSV bytes. Empty dictionary for empty input.</returns>
public async Task<Dictionary<int, byte[]>> BatchWriteToMemoryAsync<T>(IEnumerable<T> recordsToWrite) where T : class
{
    var maxFileSize = 50_000_000; // maximum file size in bytes

    // Materialize once: the source is enumerated several times below.
    var allRecords = recordsToWrite as IReadOnlyList<T> ?? recordsToWrite.ToList();
    var totalRecordCount = allRecords.Count;

    var batches = new Dictionary<int, byte[]>();
    if (totalRecordCount == 0)
        return batches; // nothing to write

    // FIX: sizeof(T) is illegal for arbitrary reference types; estimate the
    // record size from a one-record sample instead. Note this is only an
    // approximation — record sizes can vary, so batches are not guaranteed
    // to stay under the limit for heterogeneous data.
    var sample = await WriteToMemoryAsync<T>(allRecords.Take(1));
    var recordSize = Math.Max(1, sample.Length); // guard against divide-by-zero

    var recordsPerBatch = Math.Max(1, maxFileSize / recordSize);
    var nrOfBatches = (int)Math.Ceiling((double)totalRecordCount / recordsPerBatch);

    for (int batchNr = 0; batchNr < nrOfBatches; batchNr++)
    {
        var batchRecords = allRecords.Skip(batchNr * recordsPerBatch).Take(recordsPerBatch);
        batches.Add(batchNr, await WriteToMemoryAsync<T>(batchRecords));
    }

    return batches;
}
© www.soinside.com 2019 - 2024. All rights reserved.