我正在寻找一个 JavaScript 函数,它可以比较两个字符串并返回它们相似的可能性。我看过 soundex 但对于多单词字符串或非名称来说并不是很好。我正在寻找这样的功能:
function compare(strA,strB){
}
compare("Apples","apple") = Some X Percentage.
该函数适用于所有类型的字符串,包括数字、多字值和名称。也许我可以使用一个简单的算法?
最终这些都没有达到我的目的,所以我用了这个:
function compare(c, u) {
var incept = false;
var ca = c.split(",");
u = clean(u);
//ca = correct answer array (Collection of all correct answer)
//caa = a single correct answer word array (collection of words of a single correct answer)
//u = array of user answer words cleaned using custom clean function
for (var z = 0; z < ca.length; z++) {
caa = $.trim(ca[z]).split(" ");
var pc = 0;
for (var x = 0; x < caa.length; x++) {
for (var y = 0; y < u.length; y++) {
if (soundex(u[y]) != null && soundex(caa[x]) != null) {
if (soundex(u[y]) == soundex(caa[x])) {
pc = pc + 1;
}
}
else {
if (u[y].indexOf(caa[x]) > -1) {
pc = pc + 1;
}
}
}
}
if ((pc / caa.length) > 0.5) {
return true;
}
}
return false;
}
// create object listing the SOUNDEX values for each letter
// -1 indicates that the letter is not coded, but is used for coding
// 0 indicates that the letter is omitted for modern census archives
// but acts like -1 for older census archives
// 1 is for BFPV
// 2 is for CGJKQSXZ
// 3 is for DT
// 4 is for L
// 5 is for MN my home state
// 6 is for R
function makesoundex() {
this.a = -1
this.b = 1
this.c = 2
this.d = 3
this.e = -1
this.f = 1
this.g = 2
this.h = 0
this.i = -1
this.j = 2
this.k = 2
this.l = 4
this.m = 5
this.n = 5
this.o = -1
this.p = 1
this.q = 2
this.r = 6
this.s = 2
this.t = 3
this.u = -1
this.v = 1
this.w = 0
this.x = 2
this.y = -1
this.z = 2
}
var sndx = new makesoundex()
// check to see that the input is valid
function isSurname(name) {
if (name == "" || name == null) {
return false
} else {
for (var i = 0; i < name.length; i++) {
var letter = name.charAt(i)
if (!(letter >= 'a' && letter <= 'z' || letter >= 'A' && letter <= 'Z')) {
return false
}
}
}
return true
}
// Collapse out directly adjacent sounds
// 1. Assume that surname.length>=1
// 2. Assume that surname contains only lowercase letters
function collapse(surname) {
if (surname.length == 1) {
return surname
}
var right = collapse(surname.substring(1, surname.length))
if (sndx[surname.charAt(0)] == sndx[right.charAt(0)]) {
return surname.charAt(0) + right.substring(1, right.length)
}
return surname.charAt(0) + right
}
// Collapse out directly adjacent sounds using the new National Archives method
// 1. Assume that surname.length>=1
// 2. Assume that surname contains only lowercase letters
// 3. H and W are completely ignored
function omit(surname) {
if (surname.length == 1) {
return surname
}
var right = omit(surname.substring(1, surname.length))
if (!sndx[right.charAt(0)]) {
return surname.charAt(0) + right.substring(1, right.length)
}
return surname.charAt(0) + right
}
// Output the coded sequence
function output_sequence(seq) {
var output = seq.charAt(0).toUpperCase() // Retain first letter
output += "-" // Separate letter with a dash
var stage2 = seq.substring(1, seq.length)
var count = 0
for (var i = 0; i < stage2.length && count < 3; i++) {
if (sndx[stage2.charAt(i)] > 0) {
output += sndx[stage2.charAt(i)]
count++
}
}
for (; count < 3; count++) {
output += "0"
}
return output
}
// Compute the SOUNDEX code for the surname
function soundex(value) {
if (!isSurname(value)) {
return null
}
var stage1 = collapse(value.toLowerCase())
//form.result.value=output_sequence(stage1);
var stage1 = omit(value.toLowerCase())
var stage2 = collapse(stage1)
return output_sequence(stage2);
}
function clean(u) {
var u = u.replace(/\,/g, "");
u = u.toLowerCase().split(" ");
var cw = ["ARRAY OF WORDS TO BE EXCLUDED FROM COMPARISON"];
var n = [];
for (var y = 0; y < u.length; y++) {
var test = false;
for (var z = 0; z < cw.length; z++) {
if (u[y] != "" && u[y] != cw[z]) {
test = true;
break;
}
}
if (test) {
//Don't use & or $ in comparison
var val = u[y].replace("$", "").replace("&", "");
n.push(val);
}
}
return n;
}
这是基于 Levenshtein 距离的答案 https://en.wikipedia.org/wiki/Levenshtein_distance
function similarity(s1, s2) {
var longer = s1;
var shorter = s2;
if (s1.length < s2.length) {
longer = s2;
shorter = s1;
}
var longerLength = longer.length;
if (longerLength == 0) {
return 1.0;
}
return (longerLength - editDistance(longer, shorter)) / parseFloat(longerLength);
}
用于计算编辑距离
function editDistance(s1, s2) {
s1 = s1.toLowerCase();
s2 = s2.toLowerCase();
var costs = new Array();
for (var i = 0; i <= s1.length; i++) {
var lastValue = i;
for (var j = 0; j <= s2.length; j++) {
if (i == 0)
costs[j] = j;
else {
if (j > 0) {
var newValue = costs[j - 1];
if (s1.charAt(i - 1) != s2.charAt(j - 1))
newValue = Math.min(Math.min(newValue, lastValue),
costs[j]) + 1;
costs[j - 1] = lastValue;
lastValue = newValue;
}
}
}
if (i > 0)
costs[s2.length] = lastValue;
}
return costs[s2.length];
}
使用方法
similarity('Stack Overflow','Stack Ovrflw')
返回0.8571428571428571
您可以在下面玩它:
function checkSimilarity(){
var str1 = document.getElementById("lhsInput").value;
var str2 = document.getElementById("rhsInput").value;
document.getElementById("output").innerHTML = similarity(str1, str2);
}
function similarity(s1, s2) {
var longer = s1;
var shorter = s2;
if (s1.length < s2.length) {
longer = s2;
shorter = s1;
}
var longerLength = longer.length;
if (longerLength == 0) {
return 1.0;
}
return (longerLength - editDistance(longer, shorter)) / parseFloat(longerLength);
}
function editDistance(s1, s2) {
s1 = s1.toLowerCase();
s2 = s2.toLowerCase();
var costs = new Array();
for (var i = 0; i <= s1.length; i++) {
var lastValue = i;
for (var j = 0; j <= s2.length; j++) {
if (i == 0)
costs[j] = j;
else {
if (j > 0) {
var newValue = costs[j - 1];
if (s1.charAt(i - 1) != s2.charAt(j - 1))
newValue = Math.min(Math.min(newValue, lastValue),
costs[j]) + 1;
costs[j - 1] = lastValue;
lastValue = newValue;
}
}
}
if (i > 0)
costs[s2.length] = lastValue;
}
return costs[s2.length];
}
<div><label for="lhsInput">String 1:</label> <input type="text" id="lhsInput" oninput="checkSimilarity()" /></div>
<div><label for="rhsInput">String 2:</label> <input type="text" id="rhsInput" oninput="checkSimilarity()" /></div>
<div>Match: <span id="output">No Input</span></div>
使用 this 库进行字符串相似度对我来说就像一个魅力!
这是示例 -
var similarity = stringSimilarity.compareTwoStrings("Apples","apple"); // => 0.88
这是一个非常简单的函数,它进行比较并返回基于等价性的百分比。虽然它尚未针对所有可能的场景进行测试,但它可能会帮助您入门。
function similar(a,b) {
var equivalency = 0;
var minLength = (a.length > b.length) ? b.length : a.length;
var maxLength = (a.length < b.length) ? b.length : a.length;
for(var i = 0; i < minLength; i++) {
if(a[i] == b[i]) {
equivalency++;
}
}
var weight = equivalency / maxLength;
return (weight * 100) + "%";
}
alert(similar("test","tes")); // 75%
alert(similar("test","test")); // 100%
alert(similar("test","testt")); // 80%
alert(similar("test","tess")); // 75%
查找两个字符串之间的相似度;我们可以使用不止一两种方法,但我最倾向于使用 'Dice's Coefficient' 。哪个更好!据我所知,比使用 'Levenshtein 距离'
使用 npm 中的“string-similarity”包,您将能够完成我上面所说的工作。
一些简单的使用示例是
var stringSimilarity = require('string-similarity');
var similarity = stringSimilarity.compareTwoStrings('healed', 'sealed');
var matches = stringSimilarity.findBestMatch('healed', ['edward', 'sealed', 'theatre']);
欲了解更多信息,请访问上面给出的链接。谢谢你。
我很快写了一篇可能足以满足您的目的:
大小写相同但不同的字符的权重是完全不同或缺失的字符的四分之一。它返回 0 到 1 之间的数字,1 表示字符串相同。 0 表示它们没有相似之处。例子:
function Compare(strA, strB) {
for (var result = 0, i = strA.length; i--;) {
if (typeof strB[i] == 'undefined' || strA[i] == strB[i]);
else if (strA[i].toLowerCase() == strB[i].toLowerCase())
result++;
else
result += 4;
}
return 1 - (result + 4 * Math.abs(strA.length - strB.length)) / (2 * (strA.length + strB.length));
}
console.log( Compare("Apple", "Apple") );// 1
console.log( Compare("Apples", "Apple") );// 0.8181818181818181
console.log( Compare("Apples", "apple") );// 0.7727272727272727
console.log( Compare("a", "A") );// 0.75
console.log( Compare("Apples", "appppp") );// 0.45833333333333337
console.log( Compare("a", "b") );// 0
来自
PHP.js 库的函数
similar_text
怎么样?
它基于具有相同名称的 PHP 函数。
function similar_text (first, second) {
// Calculates the similarity between two strings
// discuss at: http://phpjs.org/functions/similar_text
if (first === null || second === null || typeof first === 'undefined' || typeof second === 'undefined') {
return 0;
}
first += '';
second += '';
var pos1 = 0,
pos2 = 0,
max = 0,
firstLength = first.length,
secondLength = second.length,
p, q, l, sum;
max = 0;
for (p = 0; p < firstLength; p++) {
for (q = 0; q < secondLength; q++) {
for (l = 0;
(p + l < firstLength) && (q + l < secondLength) && (first.charAt(p + l) === second.charAt(q + l)); l++);
if (l > max) {
max = l;
pos1 = p;
pos2 = q;
}
}
}
sum = max;
if (sum) {
if (pos1 && pos2) {
sum += this.similar_text(first.substr(0, pos2), second.substr(0, pos2));
}
if ((pos1 + max < firstLength) && (pos2 + max < secondLength)) {
sum += this.similar_text(first.substr(pos1 + max, firstLength - pos1 - max), second.substr(pos2 + max, secondLength - pos2 - max));
}
}
return sum;
}
字符串相似性lib vs 最佳答案(作者@overloard1234)性能比较您可以在下面找到
根据 @Tushar Walzade 使用 string-similarity 库的建议,您可以发现,例如
stringSimilatityLib.findBestMatch('KIA','Kia').bestMatch.rating
将返回0.0
所以,看起来最好用小写字母进行比较。
更好的基本用法(对于数组):
findBestMatch(str, strArr) {
const lowerCaseArr = strArr.map(element => element.toLowerCase());//creating lower case array
const match = stringSimilatityLib.findBestMatch(str.toLowerCase(), lowerCaseArr).bestMatch; //trying to find bestMatch
if (match.rating > 0) {
const foundIndex = lowerCaseArr.findIndex(x => x === match.target); //finding the index of found best case
return strArr[foundIndex]; //returning initial value from array
}
return null;
},
性能
另外,我还比较了这里的最佳答案(由@overloard1234制作)和字符串相似性库(v4.0.4)。
您可以在这里找到结果:https://jsbench.me/szkzojoskq/1
结果:字符串相似度快两倍
仅供娱乐:v2.0 的字符串相似度库速度较慢,比最新的 4.0.4 慢约 2.2 倍。如果您仍在使用,请更新它< 3.0 :)
在某种程度上,我喜欢嵌入在string-similarity模块中的Dice系数的想法。但我觉得只考虑二元组而不考虑它们的多重性会丢失一些重要的数据。下面是一个也处理多重性的版本,我认为总体上是一个更简单的实现。我不尝试使用他们的API,只提供一个函数,该函数在进行一些操作后比较两个字符串(删除非字母数字字符,小写所有内容,压缩但不删除空格),构建在一个不进行操作的情况下比较它们的函数之上。将其包装回他们的 API 中很容易,但我认为没什么需要。
const stringSimilarity = (a, b) =>
_stringSimilarity (prep (a), prep (b))
const _stringSimilarity = (a, b) => {
const bg1 = bigrams (a)
const bg2 = bigrams (b)
const c1 = count (bg1)
const c2 = count (bg2)
const combined = uniq ([... bg1, ... bg2])
.reduce ((t, k) => t + (Math .min (c1 [k] || 0, c2 [k] || 0)), 0)
return 2 * combined / (Math .max (bg1 .length + bg2 .length, 1))
}
const prep = (str) => // TODO: unicode support?
str .toLowerCase () .replace (/[^\w\s]/g, ' ') .replace (/\s+/g, ' ')
const bigrams = (str) =>
[...str] .slice (0, -1) .map ((c, i) => c + str [i + 1])
const count = (xs) =>
xs .reduce ((a, x) => ((a [x] = (a [x] || 0) + 1), a), {})
const uniq = (xs) =>
[... new Set (xs)]
console .log (stringSimilarity (
'foobar',
'Foobar'
)) //=> 1
console .log (stringSimilarity (
"healed",
"sealed"
))//=> 0.8
console .log (stringSimilarity (
"Olive-green table for sale, in extremely good condition.",
"For sale: table in very good condition, olive green in colour."
)) //=> 0.7787610619469026
console .log (stringSimilarity (
"Olive-green table for sale, in extremely good condition.",
"For sale: green Subaru Impreza, 210,000 miles"
)) //=> 0.38636363636363635
console .log (stringSimilarity (
"Olive-green table for sale, in extremely good condition.",
"Wanted: mountain bike with at least 21 gears."
)) //=> 0.1702127659574468
console .log (stringSimilarity (
"The rain in Spain falls mainly on the plain.",
"The run in Spun falls munly on the plun.",
)) //=> 0.7560975609756098
console .log (stringSimilarity (
"Fa la la la la, la la la la",
"Fa la la la la, la la",
)) //=> 0.8636363636363636
console .log (stringSimilarity (
"car crash",
"carcrash",
)) //=> 0.8
console .log (stringSimilarity (
"Now is the time for all good men to come to the aid of their party.",
"Huh?",
)) //=> 0
.as-console-wrapper {max-height: 100% !important; top: 0}
一些测试用例来自字符串相似性,其他测试用例是我自己的。它们与该软件包显示出一些显着差异,但没有什么问题。我唯一要指出的是
"car crash"
和 "carcrash"
之间的区别,字符串相似性将其视为相同,并且我报告相似性为 0.8
。我的版本在所有橄榄绿测试用例中发现了比字符串相似性更多的相似性,但由于这些在任何情况下都是相当任意的数字,我不确定它会产生多大的差异;他们当然按照相同的相对顺序放置它们。
fuzzyset - javascript 的模糊字符串集。 fuzzyset 是一种数据结构,它对数据执行类似于全文搜索的操作,以确定可能的拼写错误和近似字符串匹配。请注意,这是 python 库的 javascript 端口。
const str1 = " pARTH PARmar r ";
const str2 = " parmar r par ";
function calculateSimilarity(str1 = "", str2 = "") {
let longer = str1.trim();
let shorter = str2.trim();
let a1 = longer.toLowerCase().split(" ");
let b1 = shorter.toLowerCase().split(" ");
let result = a1.every((aa, i) => aa[0] === b1[i][0]);
if (longer.length < shorter.length) [longer,shorter] = [shorter,longer];
var arr = [];
let count = 0;
for(var i = 0;i<longer.length;i++){
if(shorter && shorter.includes(longer[i])) {
shorter = shorter.replace(longer[i],"")
count++
};
}
return {
score : (count*100)/longer.length,
result
}
}
console.log(calculateSimilarity(str1, str2));
我使用了@overlord1234函数,但更正了
ь: ''
,因为英语单词没有这个字母,接下来需要return a[char] ?? char
而不是return a[char] || char