问题:需要使用Levenshtein算法的SQL函数以返回“最低”匹配值。
代码:
CREATE FUNCTION ufn_levenshtein(@s1 nvarchar(3999), @s2 nvarchar(3999))
RETURNS int
AS
BEGIN
DECLARE @s1_len int, @s2_len int
DECLARE @i int, @j int, @s1_char nchar, @c int, @c_temp int
DECLARE @cv0 varbinary(8000), @cv1 varbinary(8000)
SELECT
@s1_len = LEN(@s1),
@s2_len = LEN(@s2),
@cv1 = 0x0000,
@j = 1, @i = 1, @c = 0
WHILE @j <= @s2_len
SELECT @cv1 = @cv1 + CAST(@j AS binary(2)), @j = @j + 1
WHILE @i <= @s1_len
BEGIN
SELECT
@s1_char = SUBSTRING(@s1, @i, 1),
@c = @i,
@cv0 = CAST(@i AS binary(2)),
@j = 1
WHILE @j <= @s2_len
BEGIN
SET @c = @c + 1
SET @c_temp = CAST(SUBSTRING(@cv1, @j+@j-1, 2) AS int) +
CASE WHEN @s1_char = SUBSTRING(@s2, @j, 1) THEN 0 ELSE 1 END
IF @c > @c_temp SET @c = @c_temp
SET @c_temp = CAST(SUBSTRING(@cv1, @j+@j+1, 2) AS int)+1
IF @c > @c_temp SET @c = @c_temp
SELECT @cv0 = @cv0 + CAST(@c AS binary(2)), @j = @j + 1
END
SELECT @cv1 = @cv0, @i = @i + 1
END
RETURN @c
END
IF OBJECT_ID('tempdb..#ExistingCustomers') IS NOT NULL
DROP TABLE #ExistingCustomers;
CREATE TABLE #ExistingCustomers
(
Customer VARCHAR(255),
ID INT
)
INSERT #ExistingCustomers SELECT 'Ed''s Barbershop', 1002
INSERT #ExistingCustomers SELECT 'GroceryTown', 1003
INSERT #ExistingCustomers SELECT 'Candy Place', 1004
INSERT #ExistingCustomers SELECT 'Handy Man', 1005
IF OBJECT_ID('tempdb..#POTENTIALCUSTOMERS') IS NOT NULL
DROP TABLE #POTENTIALCUSTOMERS;
CREATE TABLE #POTENTIALCUSTOMERS(Customer VARCHAR(255));
INSERT #POTENTIALCUSTOMERS SELECT 'Eds Barbershop'
INSERT #POTENTIALCUSTOMERS SELECT 'Grocery Town'
INSERT #POTENTIALCUSTOMERS SELECT 'Candy Place'
INSERT #POTENTIALCUSTOMERS SELECT 'Handee Man'
INSERT #POTENTIALCUSTOMERS SELECT 'The Apple Farm'
INSERT #POTENTIALCUSTOMERS SELECT 'Ride-a-Long Bikes'
SELECT A.Customer,
b.ID,
b.Customer as cust,
dbo.ufn_levenshtein(REPLACE(A.Customer, ' ', ''), REPLACE(B.Customer, ' ', '')) as ValueLev
FROM #POTENTIALCUSTOMERS a
LEFT JOIN #ExistingCustomers b ON dbo.ufn_levenshtein(REPLACE(A.Customer, ' ', ''), REPLACE(B.Customer, ' ', '')) < 15;
此返回:
我想返回的内容:
说明:结果是Levenshtein算法的“最低”值。 Levenshtein分数在The Apple Farm
和Ride-a-Long Bikes
相同的两行,在这种情况下,只要是一个值,任何值都可以。
参考:
http://www.kodyaz.com/articles/fuzzy-string-matching-using-levenshtein-distance-sql-server.aspx
如果按客户的ID划分分区并使用ValueLev订购结果,则可以使用CTE获得所需的结果:
;WITH CTE AS
(
SELECT RANK() OVER (PARTITION BY b.ID ORDER BY dbo.ufn_levenshtein(REPLACE(A.Customer, ' ', ''), REPLACE(B.Customer, ' ', '')) ASC) AS RowNbr,
A.Customer,
b.ID,
b.Customer as cust,
dbo.ufn_levenshtein(REPLACE(A.Customer, ' ', ''), REPLACE(B.Customer, ' ', '')) as ValueLev
FROM #POTENTIALCUSTOMERS a
LEFT JOIN #ExistingCustomers b ON dbo.ufn_levenshtein(REPLACE(A.Customer, ' ', ''), REPLACE(B.Customer, ' ', '')) < 15
)
SELECT *
FROM CTE
WHERE CTE.RowNbr = 1
输出:
Eds Barbershop 1002 Ed's Barbershop 1
Grocery Town 1003 GroceryTown 0
Candy Place 1004 Candy Place 0
Handee Man 1005 Handy Man 2