cs50 pset 5 的皱眉脸“正确处理最基本的单词”和“拼写检查不区分大小写”

问题描述 投票:0回答:1

所以这段代码花了我很长时间才写出来(也有 YouTube 的指导,因为它太难了),有很多起伏,但在 2 次故障后我终于编译完成了。我运行 check50 并得到一张皱眉的脸,因为“拼写检查不区分大小写”和“正确处理最基本的单词”。我真的不知道应该从哪里开始解决这个问题,也许它与我的哈希函数有关,但我不知道,所以一些帮助会非常好!

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#include "dictionary.h"

// Represents a node in a hash table
typedef struct node
{
    char word[LENGTH + 1];
    struct node *next;
}
node;

// Number of words in dictionary
int word_count = 0;

// Number of buckets in hash table
const unsigned int N = 26;

// Hash table
node *table[N];

// Returns true if word is in dictionary, else false
bool check(const char *word)
{
    unsigned int n = hash(word);

    node *cursor = table[n];

    while (cursor != NULL)
    {
        if (strcasecmp(word, cursor -> word) == 0)
        {
            return true;
        }

        cursor = cursor -> next;
    }
    return false;
}

// Hashes word to a number
// Function credit to staff on CS50 reddit page
unsigned int hash(const char *word)
{
    unsigned int hash_value = 0;

    for (int i = 0, n = strlen(word); i < n; i++)
    {
         hash_value = (hash_value << 2) ^ word[i];
    }
    return hash_value % N; 
}

// Loads dictionary into memory, returning true if successful else false
bool load(const char *dictionary)
{
    // Open dictionary and check for memory issue
    //Function guide credit to CS50 Guide by Anvea on YouTube
    FILE *dict = fopen(dictionary, "r");
    char word[LENGTH + 1];

    // Check for memory issue with dict
    if(dict == NULL)
    {
        printf("Dictionary is null\n");
        unload();
        return false;
    }

    while (fscanf(dict, "%s", word) != EOF)
    {
        node *n = malloc(sizeof(node));
        if (n == NULL)
        {
            return false;
        }

        strcpy(n -> word, word);
        word_count++;

        // Index word using hash function
        int dict_index = hash(word);

        // Insert into hash table if already empty
        if (table[dict_index] == NULL)
        {
            n -> next = NULL;
        }
        // Insert work as new node if not empyty
        else
        {
            n -> next = table[dict_index];
        }

        table[dict_index] = n;
    }

    // Close dictionary file
    fclose(dict);

    // Indicate success
    return true;
}

// Returns number of words in dictionary if loaded else 0 if not yet loaded
unsigned int size(void)
{
    return word_count;
    return 0;
}

// Unloads dictionary from memory, returning true if successful else false
bool unload(void)
{
    for (int i = 0; i < N; i++)
    {

        node *cursor = table[i];

        while (cursor)
        {
            node *tmp = cursor;
            cursor = cursor -> next;
            free(tmp);
        }
    }
    return true;
}
c cs50 edx
1个回答
0
投票

一些问题...

  1. 所有字典单词均为小写,每行一个。因此,哈希是在小写字符上完成的。
  2. 但是,检查文本可以是大写的。在
    check
    中,它对混合大小写文本调用
    hash
    ,因此它可以为某些单词生成不同的哈希值(例如)
    Won't this work?
    No, it won't.
  3. check
    中,我们需要在调用hash之前将字符串
    小写,这样我们就可以得到
    same哈希值/索引。作为其有益的副作用,我们可以用 strcasecmp
     替换 
    strcmp
    (更快)。
  4. load
     中,当 
    table[n]NULL
     时,
    no
     不需要有特殊情况,所以我们可以简化它。
  5. 虽然我没有检查这一点,但在
  6. hash
     中执行 
    <<
     可能会产生分布不均匀的哈希值,从而减慢 
    check
     中的搜索速度。
  7. 因为字典文件[保证]全部小写且每行一个条目,所以在
  8. load
     中,我们可以将 
    fscanf
     替换为 
    fgets
     [删除换行符],因为 
    fgets
     速度要快得多。
  9. gcc
     8.3.1 中,编译器标记为 
    node *table[N];
    ,因为: 
    const unsigned int N = 26;
     与 C++ 不同,这在 C 中是不允许的——它需要文字常量,而 
    not 只是 const
    。所以,我把它改成了
    enum { N = 26 };
    
    
  10. 风格注释:从不这样做(例如):x -> y
    而不是
    x->y

这是更正后的代码。我已经对照所有可能的输入文件和字典进行了检查。它注释有错误和修复:

#include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <strings.h> #if 1 #include <ctype.h> #endif #include "dictionary.h" // Represents a node in a hash table typedef struct node { char word[LENGTH + 1]; struct node *next; } node; // Number of words in dictionary int word_count = 0; // Number of buckets in hash table // NOTE/BUG: C (vs C++) doesn't support global arrays from const #if 0 const unsigned int N = 26; #else enum { N = 26 }; #endif // Hash table node *table[N]; // locase -- copy lower cased word void locase(char *dst,const char *src) { for (; *src != 0; ++src, ++dst) *dst = tolower((unsigned char) *src); *dst = 0; } // Returns true if word is in dictionary, else false bool check(const char *word) { // NOTE/FIX: we _must_ take lowercase of word _before_ calculating the hash // otherwise, the hash value will differ for uppercase words here and lowercase // words in the dictionary (i.e. different hash buckets) // NOTE/FIX: we should convert test word to lowercase _once_ (it allows us to // use strcmp below) #if 1 char lobuf[LENGTH + 1]; locase(lobuf,word); word = lobuf; #endif unsigned int n = hash(word); node *cursor = table[n]; while (cursor != NULL) { // NOTE/BUG: strcasecmp is very slow compared to strcmp #if 0 if (strcasecmp(word, cursor->word) == 0) return true; #else if (strcmp(word, cursor->word) == 0) return true; #endif cursor = cursor->next; } return false; } // Hashes word to a number // Function credit to staff on CS50 reddit page unsigned int hash(const char *word) { unsigned int hash_value = 0; #if 0 for (int i = 0, n = strlen(word); i < n; i++) hash_value = (hash_value << 2) ^ word[i]; #endif #if 0 for (int i = 0; word[i] != 0; i++) hash_value = (hash_value << 2) ^ word[i]; #endif #if 1 for (int i = 0; word[i] != 0; i++) hash_value = (hash_value * 31) ^ word[i]; #endif return hash_value % N; } // Loads dictionary into memory, returning true if successful else false bool load(const char *dictionary) { // Open dictionary and check for memory issue // Function guide credit to CS50 Guide by Anvea on YouTube FILE *dict = fopen(dictionary, "r"); #if 0 char word[LENGTH + 1]; #else char word[LENGTH + 10]; #endif // Check for memory issue with dict if (dict == NULL) { printf("Dictionary is null\n"); unload(); return false; } // NOTE/BUG: dictionary is guaranteed to be one word per line and fscanf is // slow compared to fgets #if 0 while (fscanf(dict, "%s", word) != EOF) { #else while (1) { if (fgets(word,sizeof(word),dict) == NULL) break; char *cp = strchr(word,'\n'); if (cp != NULL) *cp = 0; #endif node *n = malloc(sizeof(node)); if (n == NULL) { return false; } strcpy(n->word, word); word_count++; // Index word using hash function int dict_index = hash(word); // NOTE/BUG: no need to special case this #if 0 // Insert into hash table if already empty if (table[dict_index] == NULL) { n->next = NULL; } // Insert work as new node if not empyty else { n->next = table[dict_index]; } #else n->next = table[dict_index]; #endif table[dict_index] = n; } // Close dictionary file fclose(dict); // Indicate success return true; } // Returns number of words in dictionary if loaded else 0 if not yet loaded unsigned int size(void) { return word_count; return 0; } // Unloads dictionary from memory, returning true if successful else false bool unload(void) { for (int i = 0; i < N; i++) { node *cursor = table[i]; while (cursor) { node *tmp = cursor; cursor = cursor->next; free(tmp); } } return true; }


在上面的代码中,我使用

cpp

 条件来表示旧代码与新代码:

#if 0 // old code #else // new code #endif #if 1 // new code #endif
注意:这可以通过运行文件来清理 

unifdef -k



这是清理后的版本(通过

unifdef -k

 进行一些评论清理):

#include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <strings.h> #include <ctype.h> #include "dictionary.h" // Represents a node in a hash table typedef struct node { char word[LENGTH + 1]; struct node *next; } node; // Number of words in dictionary int word_count = 0; // Number of buckets in hash table enum { N = 26 }; // Hash table node *table[N]; // locase -- copy lower cased word void locase(char *dst,const char *src) { for (; *src != 0; ++src, ++dst) *dst = tolower((unsigned char) *src); *dst = 0; } // Returns true if word is in dictionary, else false bool check(const char *word) { char lobuf[LENGTH + 1]; locase(lobuf,word); word = lobuf; unsigned int n = hash(word); node *cursor = table[n]; while (cursor != NULL) { if (strcmp(word, cursor->word) == 0) return true; cursor = cursor->next; } return false; } // Hashes word to a number // Function credit to staff on CS50 reddit page unsigned int hash(const char *word) { unsigned int hash_value = 0; for (int i = 0; word[i] != 0; i++) hash_value = (hash_value * 31) ^ word[i]; return hash_value % N; } // Loads dictionary into memory, returning true if successful else false bool load(const char *dictionary) { // Open dictionary and check for memory issue // Function guide credit to CS50 Guide by Anvea on YouTube FILE *dict = fopen(dictionary, "r"); char word[LENGTH + 10]; // Check for memory issue with dict if (dict == NULL) { printf("Dictionary is null\n"); unload(); return false; } while (1) { if (fgets(word,sizeof(word),dict) == NULL) break; char *cp = strchr(word,'\n'); if (cp != NULL) *cp = 0; node *n = malloc(sizeof(node)); if (n == NULL) { return false; } strcpy(n->word, word); word_count++; // Index word using hash function int dict_index = hash(word); n->next = table[dict_index]; table[dict_index] = n; } // Close dictionary file fclose(dict); // Indicate success return true; } // Returns number of words in dictionary if loaded else 0 if not yet loaded unsigned int size(void) { return word_count; return 0; } // Unloads dictionary from memory, returning true if successful else false bool unload(void) { for (int i = 0; i < N; i++) { node *cursor = table[i]; while (cursor) { node *tmp = cursor; cursor = cursor->next; free(tmp); } } return true; }


虽然上面的代码[现在]是正确的,但还可以进一步改进。

有关一些其他修复和加速,请参阅我的 cs50 拼写器答案:

cs50 pset5 拼写器优化

以下是链接中您的[固定]版本与我的[优化]版本之间

check

功能所用时间的比较:

Yours Mine File 27.900433 0.271936 aca.txt 8.713822 0.093062 austen.txt 1.553954 0.015378 birdman.txt 3.996230 0.041159 burnett.txt 1.927505 0.021139 carroll.txt 0.001437 0.000007 cat.txt 0.477209 0.005412 constitution.txt 12.684350 0.141040 federalist.txt 5.947873 0.060730 frankenstein.txt 6.810451 0.081574 grimm.txt 1.279325 0.013508 her.txt 78.311622 0.861220 holmes.txt 14.265868 0.145593 homer.txt 1.297946 0.013600 lalaland.txt 4.110416 0.042714 mansfield.txt 0.002796 0.000017 pneumonoultramicroscopicsilicovolcanoconiosis.txt 1.739467 0.017283 revenant.txt 5.238748 0.054731 rinehart.txt 68.048034 0.680697 shakespeare.txt 6.071052 0.064508 stein.txt 11.721317 0.121086 stoker.txt 14.137166 0.146902 surgery.txt 40.615153 0.433262 tolstoy.txt 9.731160 0.099112 wells.txt 39.266734 0.457958 whittier.txt 0.014416 0.000132 wordsworth.txt 13.774792 0.144299 xueqin1.txt 18.768864 0.196494 xueqin2.txt
    
© www.soinside.com 2019 - 2024. All rights reserved.