我需要实现一个为通用数据提供合并排序和快速排序算法的库,实现以下函数原型:
void merge_sort(void *base, size_t nitems, size_t size, int (*compar)(const void*, const void*));
void quick_sort(void *base, size_t nitems, size_t size, int (*compar)(const void*, const void*));
地点:
为了进行测试,我将使用一个名为
records.csv
的文件,其中包含 2000 万条要排序的记录。每条记录都在一行上进行描述,并包含以下字段:
格式为标准 CSV:字段以逗号分隔,记录以
\n
分隔。使用之前实现的算法,我需要实现以下函数来根据三个“字段”列中包含的值以非降序对文件中包含的记录进行排序records.csv
:
void sort_records(FILE *infile, FILE *outfile, size_t field, size_t algo);
归并排序和快速排序的算法是这样的:
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
int counter = 0;
// Merge function for Merge Sort
static void merge(void *base, size_t left_size, size_t right_size, size_t size, int (*compar)(const void *, const void *)) {
size_t i, j, k;
// Calculate total size needed for temporary array
size_t total_size = (left_size + right_size) * size;
// Allocate memory for temporary array with the new size
void *temp = malloc(total_size);
if (temp == NULL) {
// Handle memory allocation failure
fprintf(stderr, "Error: Unable to allocate memory for temporary array\n");
exit(EXIT_FAILURE);
}
i = j = k = 0;
while (i < left_size && j < right_size) {
if (compar((char *)base + i * size, (char *)base + (left_size + j) * size) <= 0) {
memcpy((char *)temp + k * size, (char *)base + i * size, size);
i++;
} else {
memcpy((char *)temp + k * size, (char *)base + (left_size + j) * size, size);
j++;
}
k++;
}
while (i < left_size) {
memcpy((char *)temp + k * size, (char *)base + i * size, size);
i++;
k++;
}
while (j < right_size) {
memcpy((char *)temp + k * size, (char *)base + (left_size + j) * size, size);
j++;
k++;
}
memcpy(base, temp, (left_size + right_size) * size);
free(temp);
printf(". %d", counter);
counter++;
}
// Merge Sort Algorithm
void merge_sort(void *base, size_t nitems, size_t size, int (*compar)(const void *, const void *)) {
if (nitems <= 1) {
return;
}
size_t middle = nitems / 2;
merge_sort(base, middle, size, compar);
merge_sort((char *)base + middle * size, nitems - middle, size, compar);
merge(base, middle, nitems - middle, size, compar);
}
// Partition function for Quick Sort
static size_t partition(void *base, size_t nitems, size_t size, int (*compar)(const void *, const void *)) {
void *pivot = malloc(size);
memcpy(pivot, base, size);
size_t i = 1;
size_t j = nitems - 1;
while (1) {
while (i < nitems && compar((char *)base + i * size, pivot) <= 0) {
i++;
}
while (j > 0 && compar((char *)base + j * size, pivot) > 0) {
j--;
}
if (i < j) {
// Swap elements
void *temp = malloc(size);
memcpy(temp, (char *)base + i * size, size);
memcpy((char *)base + i * size, (char *)base + j * size, size);
memcpy((char *)base + j * size, temp, size);
free(temp);
} else {
// Place pivot at correct position
memcpy(base, pivot, size);
free(pivot);
return j;
}
}
}
// Quick Sort Algorithm
static void quick_sort_recursive(void *base, size_t nitems, size_t size, int (*compar)(const void *, const void *)) {
if (nitems <= 1) {
return;
}
size_t pivot_index = partition(base, nitems, size, compar);
if (pivot_index > 0) {
quick_sort_recursive(base, pivot_index, size, compar);
}
if (pivot_index < nitems - 1) {
quick_sort_recursive((char *)base + (pivot_index + 1) * size, nitems - pivot_index - 1, size, compar);
}
}
void quick_sort(void *base, size_t nitems, size_t size, int (*compar)(const void *, const void *)) {
quick_sort_recursive(base, nitems, size, compar);
}
以下代码用于加载记录并使用算法:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include "Esercizio1.h"
#define INITIAL_CAPACITY 2
/* Structure of each record in the .csv file */
struct record {
int id;
char *field1;
int field2;
float field3;
};
/* Takes the read buffer, the element number in the array (i) and the array pointer.
Separates the buffer into tokens and loads the record in the array */
void load_record(char buffer[1024], size_t i, struct record **array) {
struct record *record;
char *token;
record = malloc(sizeof(struct record));
if(record == NULL) {
fprintf(stderr,"load_record: unable to allocate memory for the read record\n");
exit(1);
}
token = strtok(buffer, ",");
record->id = atoi(token);
token = strtok(NULL, ",");
record->field1 = malloc((strlen(token) + 1) * sizeof(char));
strcpy(record->field1, token);
token = strtok(NULL, ",");
record->field2 = atoi(token);
token = strtok(NULL, "\n");
record->field3 = atof(token);
array[i] = record;
}
/* Takes 2 struct record pointers and comapres their respective field1 (string).
It returns:
1 if the first string precedes the second one (in alphabetical order)
0 otherwise */
int compar_field1(const void *r1_p, const void *r2_p) {
struct record *rec1_p, *rec2_p;
if(r1_p == NULL) {
fprintf(stderr, "compar_field1: first parameter is a NULL pointer.\n");
exit(1);
}
if(r2_p == NULL) {
fprintf(stderr, "compar_field1: second parameter is a NULL pointer.\n");
exit(1);
}
rec1_p = (struct record*)r1_p;
rec2_p = (struct record*)r2_p;
return strcmp(rec1_p->field1, rec2_p->field1) < 0;
}
/* Takes 2 struct record pointers and comapres their respective field2 (int).
It returns:
1 if the first int is smaller than the second one
0 otherwise */
int compar_field2(const void *r1_p, const void *r2_p) {
struct record *rec1_p, *rec2_p;
if(r1_p == NULL) {
fprintf(stderr, "compar_field2: first parameter is a NULL pointer.\n");
exit(1);
}
if(r2_p == NULL) {
fprintf(stderr, "compar_field2: second parameter is a NULL pointer.\n");
exit(1);
}
rec1_p = (struct record*)r1_p;
rec2_p = (struct record*)r2_p;
return rec1_p->field2 < rec2_p->field2;
}
/* Takes 2 struct record pointers and comapres their respective field3 (float).
It returns:
1 if the first float is smaller than the second one
0 otherwise */
int compar_field3(const void *r1_p, const void *r2_p) {
struct record *rec1_p, *rec2_p;
if(r1_p == NULL) {
fprintf(stderr, "compar_field3: first parameter is a NULL pointer.\n");
exit(1);
}
if(r2_p == NULL) {
fprintf(stderr, "compar_field3: second parameter is a NULL pointer.\n");
exit(1);
}
rec1_p = (struct record*)r1_p;
rec2_p = (struct record*)r2_p;
return rec1_p->field3 < rec2_p->field3;
}
/* Takes the output file, the ordered array and the number of elements in the array,
prints each record in the .csv output file */
void print_file(FILE *outfile, struct record **array, size_t n_elements) {
size_t i;
for(i = 0; i < n_elements; i++) {
fprintf(outfile, "%d,%s,%d,%f\n", array[i]->id, array[i]->field1, array[i]->field2, array[i]->field3);
}
}
/* Takes the input file, the output file, the field and the chosen sorting algorithm.
It does (in order):
- Creates an empty array with an inistial capacity of 2 elements
- Loads all records inside the array (reallocates double the memory when needed)
- Reallocates memory at the end when all records have been read (it usually saves memory)
- Calls the chosen sorting library
- Prints the output file */
void sort_records(FILE *infile, FILE *outfile, size_t field, size_t algo) {
struct record **array;
struct timespec start, end;
double time_taken;
char buffer[1024];
size_t capacity = 2;
size_t n_elements = 0;
array = (struct record**) malloc(INITIAL_CAPACITY * sizeof(struct record*));
if(array == NULL) {
fprintf(stderr, "sort_records: unable to allocate memory for records\n");
exit(1);
}
clock_gettime(CLOCK_REALTIME, &start);
while(fgets(buffer, 1024, infile) != NULL) {
if(n_elements == capacity) {
array = (struct record**) realloc(array, sizeof(struct record*) * 2 * capacity);
if(array == NULL) {
fprintf(stderr,"sort_records: unable to reallocate memory for records\n");
exit(1);
}
capacity = capacity * 2;
}
load_record(buffer, n_elements, array);
n_elements++;
}
array = (struct record**) realloc(array, sizeof(struct record*) * n_elements);
if(array == NULL) {
fprintf(stderr,"sort_records: unable to reallocate memory for records at the end\n");
exit(1);
}
clock_gettime(CLOCK_REALTIME, &end);
time_taken = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1000000000.0;
printf("1) CSV file loaded in RAM!\n");
printf(" Number of records loaded: %ld\n", n_elements);
printf(" Time elapsed: %f seconds\n", time_taken);
clock_gettime(CLOCK_REALTIME, &start);
if(algo == 1) {
merge_sort(array, n_elements, sizeof(struct record), compar_field1);
} else {
quick_sort(array, n_elements, sizeof(struct record), compar_field1);
}
clock_gettime(CLOCK_REALTIME, &end);
time_taken = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1000000000.0;
printf("2) Records sorted!\n");
printf(" Time elapsed: %f seconds\n", time_taken);
clock_gettime(CLOCK_REALTIME, &start);
print_file(outfile, array, n_elements);
clock_gettime(CLOCK_REALTIME, &end);
time_taken = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1000000000.0;
printf("3) File printed!\n");
printf(" Time elapsed: %f seconds\n", time_taken);
}
/* Takes from command line the 2 file paths + field and algo parameters.
It then performs some checks on the inputs given and, if they all pass,
sort_records() is called. */
int main(int argc, char const *argv[]) {
FILE *infile, *outfile;
size_t field, algo;
if(argc == 5) {
infile = fopen(argv[1], "r");
if(infile == NULL) {
fprintf(stderr, "ERROR! Input file does not exist.\n");
fprintf(stderr, "Terminating...\n");
exit(1);
}
outfile = fopen(argv[2], "w");
if(outfile == NULL) {
fprintf(stderr, "ERROR! Output file can't be created or is already in use.\n");
fprintf(stderr, "Terminating...\n");
exit(1);
}
field = atol(argv[3]);
if(field < 1 || field > 3) {
fprintf(stderr, "ERROR! Field parameter can only be equal to 1, 2 or 3.\n");
fprintf(stderr, "Terminating...\n");
exit(1);
}
algo = atol(argv[4]);
if(algo < 1 || algo > 2) {
fprintf(stderr, "ERROR! Algo parameter can only be equal to 1 or 2.\n");
fprintf(stderr, "Terminating...\n");
exit(1);
}
sort_records(infile, outfile, field, algo);
} else {
fprintf(stderr, "ERROR! Wrong amount of arguments.\n");
fprintf(stderr, "To launch this program correctly, type ./main_ex1 /input_file /output_file field algo\n");
fprintf(stderr, "Terminating...\n");
exit(1);
}
printf("SUCCESS! Terminating...\n");
return 0;
}
该算法看起来不错,但在某些时候它会给我分段错误:
zsh: segmentation fault ./main_ex1 ../records.csv ../out.csv 1 1
问题很可能是由于快速排序中的堆栈溢出造成的。由于该文件不可用,我无法测试代码。这应该可以通过在较小的分区上递归并在较大的分区上循环来解决堆栈溢出问题,但最坏情况的时间复杂度仍然是 O(n^2),在这种情况下排序可能无法在合理的时间内完成。分区应使用中间值作为主元:
// Partition function for Quick Sort
static size_t partition(void *base, size_t nitems, size_t size, int (*compar)(const void *, const void *)) {
void *pivot = malloc(size);
memcpy(pivot, base + ((nitems/2) * size); /* use middle value */
// ...
// Quick Sort Algorithm
static void quick_sort_recursive(void *base, size_t nitems, size_t size, int (*compar)(const void *, const void *)) {
if (nitems <= 1) {
return;
}
while(nitems > 1){ /* recurse on smaller, loop on larger */
size_t pivot_index = partition(base, nitems, size, compar);
if(pivot_index - 0 <= nitems - pivot_index){
quick_sort_recursive(base, pivot_index, size, compar);
base = (char *)base + (pivot_index + 1) * size;
nitems = nitems - pivot_index - 1;
} else {
quick_sort_recursive((char *)base + (pivot_index + 1) * size, nitems - pivot_index - 1, size, compar);
nitems = pivot_index;
}
}
}