我修改了脚本以获取汉字的字典含义。
在下面的 `loadDictData()` 方法中,当字典数据文件从 localhost 获取时,`search` 方法返回正确的结果;但当字典数据文件从 CDN 获取时,`search` 方法返回不同的结果。
这是从本地主机加载的所有文件的结果。
git clone https://github.com/cschiller/zhongwen.git
cd zhongwen
python -m http.server
然后打开浏览器并粘贴下面的脚本,结果将是类似的。
这是从 cdn 加载的字典数据 cedict_ts.u8 的结果。使用 cdn url 打开新选项卡,然后将以下脚本粘贴到开发控制台中。结果不一样。
// Base URL (this machine, port 8000) interpolated into the fetch URLs built in loadDictData().
let host = "http://127.0.0.1:8000";
/**
 * Fetches the four dictionary resources in parallel.
 *
 * The word index and both keyword files are fetched from the module-level
 * `host`; the dictionary text is fetched from the jsDelivr CDN.
 *
 * cedict.idx stores offsets into cedict_ts.u8 (ZhongwenDictionary.wordSearch
 * slices the dictionary text at those offsets), so the index and the
 * dictionary must be the same revision of the data for lookups to be right.
 * NOTE(review): `@latest` on the CDN may resolve to a different revision than
 * the files served by `host` - confirm before mixing origins.
 *
 * @returns {Promise<Array>} Resolves to
 *   `[wordDict, wordIndex, grammarKeywords, vocabKeywords]`.
 * @throws {Error} (as a rejection) when any response has a non-OK HTTP status.
 */
async function loadDictData() {
  // Kept in a local constant: reassigning the shared `host` here would make
  // every later call fetch the index and keyword files from the CDN as well.
  // Set this to `host` to load all four files from the same origin.
  const dictHost = "https://cdn.jsdelivr.net/gh/cschiller/zhongwen@latest";

  // Reject on HTTP errors instead of parsing an error page as dictionary data.
  const fetchOk = async (url) => {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Failed to fetch ${url}: HTTP ${response.status}`);
    }
    return response;
  };
  const fetchText = async (url) => (await fetchOk(url)).text();
  const fetchJson = async (url) => (await fetchOk(url)).json();

  // Start every request before awaiting any of them so they run in parallel.
  const wordIndex = fetchText(`${host}/data/cedict.idx`);
  const grammarKeywords = fetchJson(`${host}/data/grammarKeywordsMin.json`);
  const vocabKeywords = fetchJson(`${host}/data/vocabularyKeywordsMin.json`);
  const wordDict = fetchText(`${dictHost}/data/cedict_ts.u8`);

  return Promise.all([wordDict, wordIndex, grammarKeywords, vocabKeywords]);
}
/**
 * In-memory dictionary lookup.
 *
 * A word is resolved through a sorted index whose lines have the form
 * `word,offset[,offset...]`; each offset is a position in the dictionary text
 * at which one entry line starts.
 */
class ZhongwenDictionary {
  /**
   * @param {string} wordDict - Dictionary text, one entry per line.
   * @param {string} wordIndex - Index text, one `word,offset,...` line per
   *   word, sorted so that `find` can binary-search it.
   * @param {Object} grammarKeywords - Object keyed by grammar keyword.
   * @param {Object} vocabKeywords - Object keyed by vocabulary keyword.
   */
  constructor(wordDict, wordIndex, grammarKeywords, vocabKeywords) {
    this.wordDict = wordDict;
    this.wordIndex = wordIndex;
    this.grammarKeywords = grammarKeywords;
    this.vocabKeywords = vocabKeywords;
    // Null-prototype object: with a plain `{}` a word such as "constructor"
    // would resolve to an inherited property and be treated as a cache hit.
    this.cache = Object.create(null);
  }

  /**
   * Binary-searches newline-separated, sorted text for a line that starts
   * with `needle`.
   *
   * @param {string} needle - Prefix to look for (callers pass `word + ','`).
   * @param {string} haystack - Sorted text, one record per line. A trailing
   *   newline after the last line is optional.
   * @returns {string|null} The matching line without its newline, or null.
   */
  static find(needle, haystack) {
    let beg = 0;
    let end = haystack.length - 1;

    while (beg < end) {
      const mi = Math.floor((beg + end) / 2);
      // Expand the probe position to the boundaries of the line it falls in.
      const lineStart = haystack.lastIndexOf('\n', mi) + 1;
      let lineEnd = haystack.indexOf('\n', mi + 1);
      if (lineEnd === -1) {
        // Last line without a terminating newline: it ends at end of text.
        // Without this, -1 would reset `beg` to 0 (endless loop) or make
        // substring() swap its arguments and return the wrong slice.
        lineEnd = haystack.length;
      }

      const mis = haystack.slice(lineStart, lineStart + needle.length);
      if (needle < mis) {
        end = lineStart - 1;
      } else if (needle > mis) {
        beg = lineEnd + 1;
      } else {
        return haystack.substring(lineStart, lineEnd);
      }
    }
    return null;
  }

  /**
   * @param {string} keyword
   * @returns {*} The value stored for `keyword` in the grammar keyword
   *   object (undefined when absent); callers use it as a truthy flag.
   */
  hasGrammarKeyword(keyword) {
    return this.grammarKeywords[keyword];
  }

  /**
   * @param {string} keyword
   * @returns {*} The value stored for `keyword` in the vocabulary keyword
   *   object (undefined when absent); callers use it as a truthy flag.
   */
  hasVocabKeyword(keyword) {
    return this.vocabKeywords[keyword];
  }

  /**
   * Collects dictionary lines for `word` and for each of its shorter
   * prefixes, longest first.
   *
   * @param {string} word - Text to look up.
   * @param {number} [max] - Maximum number of entries to collect; a missing
   *   or zero value means 7.
   * @returns {?{data: Array, matchLen: number, more: (number|undefined)}}
   *   null when nothing matched. Otherwise `data` holds
   *   `[dictionaryLine, matchedWord]` pairs, `matchLen` is the length of the
   *   longest matched word, and `more` is 1 when results were cut at `max`.
   */
  wordSearch(word, max) {
    const entry = { data: [] };
    const dict = this.wordDict;
    // `||` is deliberate: both undefined and 0 fall back to the default.
    const maxTrim = max || 7;
    let count = 0;
    let maxLen = 0;
    let candidate = word;

    SEARCH:
    while (candidate.length > 0) {
      let ix = this.cache[candidate];
      if (!ix) {
        const indexLine = ZhongwenDictionary.find(`${candidate},`, this.wordIndex);
        // Misses are cached as an empty list so they are not searched again.
        ix = indexLine ? indexLine.split(',') : [];
        this.cache[candidate] = ix;
      }

      // Field 0 is the word itself; the remaining fields are offsets.
      for (let j = 1; j < ix.length; ++j) {
        if (count >= maxTrim) {
          entry.more = 1;
          break SEARCH;
        }
        const offset = Number(ix[j]);
        let lineEnd = dict.indexOf('\n', offset);
        if (lineEnd === -1) {
          // Final dictionary line without a trailing newline.
          lineEnd = dict.length;
        }
        ++count;
        if (maxLen === 0) {
          // The first hit comes from the longest candidate that matched.
          maxLen = candidate.length;
        }
        entry.data.push([dict.substring(offset, lineEnd), candidate]);
      }

      // Retry with the last UTF-16 code unit removed.
      candidate = candidate.slice(0, -1);
    }

    if (entry.data.length === 0) {
      return null;
    }
    entry.matchLen = maxLen;
    return entry;
  }
}
/**
 * Downloads the dictionary resources via loadDictData() and wraps them in a
 * ZhongwenDictionary.
 *
 * @returns {Promise<ZhongwenDictionary>}
 */
async function loadDictionary() {
  const resources = await loadDictData();
  const [dictText, indexText, grammar, vocab] = resources;
  return new ZhongwenDictionary(dictText, indexText, grammar, vocab);
}
// Shared dictionary instance read by search(); stays undefined until loading succeeds.
let dict;
dict = await loadDictionary();
/**
 * Looks `text` up in the shared `dict` and tags grammar / vocabulary hits.
 *
 * @param {string} text - Text to look up.
 * @returns {Object|null|undefined} undefined when no dictionary is loaded;
 *   otherwise whatever `dict.wordSearch(text)` returned, with `grammar` and
 *   `vocab` set to `{ keyword, index }` for matching longest-length words.
 */
function search(text) {
  if (!dict) {
    return;
  }

  const entry = dict.wordSearch(text);
  console.log("entry", entry);
  if (!entry) {
    return entry;
  }

  entry.data.forEach((row, position) => {
    const keyword = row[1];
    // Later rows overwrite earlier ones, so the recorded index ends up being
    // the last row whose word has the maximum matched length.
    if (dict.hasGrammarKeyword(keyword) && keyword.length === entry.matchLen) {
      entry.grammar = { keyword: keyword, index: position };
    }
    if (dict.hasVocabKeyword(keyword) && keyword.length === entry.matchLen) {
      entry.vocab = { keyword: keyword, index: position };
    }
  });

  return entry;
}
// Example lookup: search for "你好" and print the returned entry.
let res = search("你好");
console.log(res);
我无法运行你给的代码,我认为这是编码问题,比如 UTF-8 和 GB2312 之间的差异。另外,我是一个正在学习英语的中国人,我们可以互相学习。