SQL函数-用Levenshtein距离算法进行模糊匹配-仅返回最低值

问题描述 投票:0回答:1

问题:需要使用Levenshtein算法的SQL函数以返回“最低”匹配值。

代码


-- Computes the Levenshtein (edit) distance between @s1 and @s2: the minimum
-- number of single-character insertions, deletions and substitutions needed
-- to turn one string into the other.
--
-- Parameters:
--   @s1, @s2  strings to compare (up to 3999 characters each, so that one
--             row of 2-byte costs fits in a varbinary(8000)).
-- Returns:
--   the distance as int; NULL when either argument is NULL.
--
-- Notes:
--   * Lengths are taken with LEN, which ignores trailing spaces, so trailing
--     blanks do not contribute to the distance.
--   * Characters are compared with "=", so the result follows the collation
--     in effect (e.g. case-insensitive under a CI collation).
--   * Only two rows of the dynamic-programming matrix are kept; each row is
--     packed into a varbinary as consecutive 2-byte cells.
CREATE FUNCTION ufn_levenshtein(@s1 nvarchar(3999), @s2 nvarchar(3999))
RETURNS int
AS
BEGIN
 DECLARE @s1_len int, @s2_len int
 DECLARE @i int, @j int, @s1_char nchar, @c int, @c_temp int
 -- @cv1 = previous matrix row, @cv0 = row currently being built
 DECLARE @cv0 varbinary(8000), @cv1 varbinary(8000)

 -- A distance to/from an unknown value is unknown.
 IF @s1 IS NULL OR @s2 IS NULL
  RETURN NULL

 SELECT
  @s1_len = LEN(@s1),
  @s2_len = LEN(@s2),
  @cv1 = 0x0000,
  @j = 1, @i = 1

 -- Seed the result with the cost of building @s2 from an empty string, so
 -- that an empty @s1 (outer loop never runs) returns LEN(@s2) instead of 0.
 SET @c = @s2_len

 -- Row 0 of the matrix: cell j holds j (j insertions from the empty prefix).
 WHILE @j <= @s2_len
  SELECT @cv1 = @cv1 + CAST(@j AS binary(2)), @j = @j + 1

 WHILE @i <= @s1_len
 BEGIN
  -- Cell 0 of row i is i (i deletions down to the empty prefix).
  SELECT
   @s1_char = SUBSTRING(@s1, @i, 1),
   @c = @i,
   @cv0 = CAST(@i AS binary(2)),
   @j = 1

  WHILE @j <= @s2_len
  BEGIN
   -- Candidate 1: cell to the left in the current row, plus one.
   SET @c = @c + 1
   -- Candidate 2: diagonal cell (previous row, column j-1), plus 0 or 1
   -- depending on whether the two characters match.
   SET @c_temp = CAST(SUBSTRING(@cv1, @j+@j-1, 2) AS int) +
    CASE WHEN @s1_char = SUBSTRING(@s2, @j, 1) THEN 0 ELSE 1 END
   IF @c > @c_temp SET @c = @c_temp
   -- Candidate 3: cell directly above (previous row, column j), plus one.
   SET @c_temp = CAST(SUBSTRING(@cv1, @j+@j+1, 2) AS int)+1
   IF @c > @c_temp SET @c = @c_temp
   -- Append the cheapest cost as the next 2-byte cell of the current row.
   SELECT @cv0 = @cv0 + CAST(@c AS binary(2)), @j = @j + 1
  END

  -- The finished row becomes the "previous" row for the next character.
  SELECT @cv1 = @cv0, @i = @i + 1
 END

 -- @c holds the last cell computed, i.e. the bottom-right matrix cell.
 RETURN @c
END




-- Rebuild the scratch table of already-known customers from a clean slate
-- so the script can be re-run in the same session.
IF OBJECT_ID('tempdb..#ExistingCustomers') IS NOT NULL
BEGIN
    DROP TABLE #ExistingCustomers;
END;

CREATE TABLE #ExistingCustomers
(
    Customer VARCHAR(255),
    ID       INT
);

-- Sample data: one row per known customer name and its ID.
INSERT INTO #ExistingCustomers (Customer, ID)
VALUES
    ('Ed''s Barbershop', 1002),
    ('GroceryTown',      1003),
    ('Candy Place',      1004),
    ('Handy Man',        1005);



-- Rebuild the scratch table of candidate customer names that will be
-- fuzzy-matched against the existing customers.
IF OBJECT_ID('tempdb..#POTENTIALCUSTOMERS') IS NOT NULL
BEGIN
    DROP TABLE #POTENTIALCUSTOMERS;
END;

CREATE TABLE #POTENTIALCUSTOMERS
(
    Customer VARCHAR(255)
);

-- Sample data: some names are near-misses of existing customers, the last
-- two have no obvious counterpart.
INSERT INTO #POTENTIALCUSTOMERS (Customer)
VALUES
    ('Eds Barbershop'),
    ('Grocery Town'),
    ('Candy Place'),
    ('Handee Man'),
    ('The Apple Farm'),
    ('Ride-a-Long Bikes');


-- Pair every potential customer with each existing customer whose
-- space-stripped name is within an edit distance of 15. Because every
-- qualifying pairing is returned, a potential customer can appear on
-- several rows.
SELECT p.Customer,
       e.ID,
       e.Customer AS cust,
       dbo.ufn_levenshtein(p.compact_name, REPLACE(e.Customer, ' ', '')) AS ValueLev
FROM (
    -- Strip spaces from the candidate name once and reuse it below.
    SELECT Customer,
           REPLACE(Customer, ' ', '') AS compact_name
    FROM #POTENTIALCUSTOMERS
) AS p
LEFT JOIN #ExistingCustomers AS e
    ON dbo.ufn_levenshtein(p.compact_name, REPLACE(e.Customer, ' ', '')) < 15;

此返回:

results

我想返回的内容:

desired results

说明:结果应为Levenshtein算法得出的“最低”值。“The Apple Farm”和“Ride-a-Long Bikes”这两行各自存在Levenshtein分数相同的多个匹配项;在这种情况下,只要返回其中一个即可,任意一个都可以。

参考

SQL Fuzzy Join - MSSQL

http://www.kodyaz.com/articles/fuzzy-string-matching-using-levenshtein-distance-sql-server.aspx

sql algorithm function
1个回答
0
投票

如果按客户ID分区并按ValueLev对结果排序,则可以使用CTE获得所需的结果:

-- Keep, for each existing customer ID, the candidate pairing(s) with the
-- smallest edit distance.
-- NOTE(review): ranking is partitioned by the existing customer's ID, so the
-- result has the best row(s) per ID rather than per potential customer; a
-- potential customer that is not the closest match for any ID is filtered
-- out. RANK() also keeps every row tied for first place.
;WITH scored AS
(
    -- Stage 1: all pairings within distance 15, with the distance computed.
    SELECT  p.Customer,
            e.ID,
            e.Customer AS cust,
            dbo.ufn_levenshtein(REPLACE(p.Customer, ' ', ''), REPLACE(e.Customer, ' ', '')) AS ValueLev
      FROM  #POTENTIALCUSTOMERS AS p
        LEFT JOIN #ExistingCustomers AS e
            ON dbo.ufn_levenshtein(REPLACE(p.Customer, ' ', ''), REPLACE(e.Customer, ' ', '')) < 15
),
ranked AS
(
    -- Stage 2: rank pairings inside each ID, closest first.
    SELECT  RANK() OVER (PARTITION BY s.ID ORDER BY s.ValueLev ASC) AS RowNbr,
            s.Customer,
            s.ID,
            s.cust,
            s.ValueLev
      FROM  scored AS s
)
SELECT  r.RowNbr,
        r.Customer,
        r.ID,
        r.cust,
        r.ValueLev
  FROM  ranked AS r
  WHERE r.RowNbr = 1

输出:

Eds Barbershop  1002    Ed's Barbershop 1
Grocery Town    1003    GroceryTown     0
Candy Place     1004    Candy Place     0
Handee Man      1005    Handy Man       2
© www.soinside.com 2019 - 2024. All rights reserved.