从单词列表中查找所有彼此仅一个字母不同的单词的最快方法

问题描述 投票:0回答:1

我试图找到列表中单词互不相同的所有字母。然后我将它们输出到一个数组中。问题是我当前的代码需要 17 秒才能完成 18, 000 个单词,而我试图减少所需的时间。

有什么办法可以加快速度吗?

#compares words and returns true if they are neighbours
def isneighbour(word1, word2):
    diff = 0
    for i in range(len(word1)):
        if word1[i] != word2[i]:
            diff += 1
            if diff > 1:
                return False
    return diff == 1


def find_neighbours(lst):
    #take a word from lst
    #compare to rest of words from lst
    hood = []
    #convert dictionary to list
    #search for all words of same length
    keylst = list(lst.keys())
    length = len(keylst) -1
    for i in range(0, length -1):
        temp = []
        temp.append(keylst[i])
        for j in range(i + 1, length):
            if len(keylst[i]) == len(keylst[j]):
                #test to see if two words of the same length are neighbours
                if isneighbour(keylst[i], keylst[j]):
                    temp.append(keylst[j])
        if len(temp)>1:
            quicksort(temp)
            hood.append(temp)
    return hood

我用谷歌搜索了一下,但不知道如何改进这一点。我是学生,所以所有建议都会有用。

python
1个回答
0
投票

编辑距离

  • 这取决于您对“彼此相差[一个]字母”的定义。在我看来,字母的顺序确实很重要,这使得问题变得更加复杂。

  • 如果这些是您的假设,我会从使用动态编程的 Levenshtein 开始,然后根据您的预期输出调整我的算法,或者您可以使用它来缩小搜索空间,因为 Levenshtein 距离相当大快。


import random, string, collections


def levenshtein_edit_distance(A, B):
    dp = [[0] * (len(B) + 1) for _ in range(len(A) + 1)]
    for i in range(len(A) + 1):
        dp[i][0] = i
    for j in range(len(B) + 1):
        dp[0][j] = j

    for i in range(1, len(A) + 1):
        for j in range(1, len(B) + 1):
            if A[i - 1] == B[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = 1 + min(dp[i - 1][j - 1], dp[i - 1][j], dp[i][j - 1])

    return dp[-1][-1]


def _gen_words(l):
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(l))


def _modify(word, diffs=1):
    word = list(word)
    for _ in range(diffs):
        i = random.randint(0, len(word) - 1)
        word[i] = random.choice(string.ascii_lowercase)
    return ''.join(word)


words = [_gen_words(3)]

random.seed(10)

for _ in range(100000):
    diffs = random.randint(1, 1)
    w = _modify(words[-1], diffs)
    words.append(w)

unique_words = list(set(words))

memo = collections.defaultdict(list)
for a, b in zip(unique_words[:], unique_words[1:]):
    dist = levenshtein_edit_distance(a, b)
    if dist == 1:
        memo[a] += [b]

print(memo)

打印

defaultdict(, {'sqb': ['sqe'], 'evn': ['lvn'], 'wsr': ['wfr'],'nnl':['cnl'],'乔':['boe'],'qol':['qzl'],'rbm': ['rbk'],'tsj':['tsg'],'sga':['sgt'],'abw':['arw'],'yjl': ['fjl'],'wth':['wah'],'zkn':['ukn'],'shs':['xhs'],'szi': ['czi'],'eps':['cps'],'asb':['hsb'],'ish':['iss'],'owm': ['rwm'],'vae':['vab'],'dou':['doz'],'qeh':['qch'],'wod': ['woj'],'pyg':['ryg'],'ctf':['jtf'],'hjw':['hjq'],'alx': ['clx'],'ikj':['ikp'],'gyr':['健身房'],'pkj':['kkj'],'bcy': ['xcy'],'ebr':['evr'],'gcm':['gcq'],'iqw':['iqx'],'bml': ['bma'],'wrz':['wwz'],'psk':['ksk'],'wtp':['wzp'],'inc': ['inn'],'qbp':['mbp'],'bas':['aas'],'jfx':['jyx'],'osa': ['tsa'],'shn':['太阳'],'lhs':['fhs'],'fzm':['fzu'],'rzc': ['ruc'],'idi':['ido'],'sdw':['sdo'],'碱液':['hye'],'dht': ['cht'],'gjy':['gyy'],'efs':['efb'],'rnb':['rqb'],'gtw': ['rtw'],'pgf':['pgn'],'yhz':['ycz'],'rre':['rce'],'box': ['fox'], 'zql': ['zqn'], 'wmc': ['wmx'], 'tny': ['tey'], 'ayy': ['azy'],'jka':['jkn'],'ost':['ist'],'ktc':['kcc'],'zxl': ['zxo'],'jfs':['jvs'],'jgy':['rgy'],'tql':['tel'],'yne': ['yje'],'mpa':['ypa'],'uwg':['ufg'],'uec':['uee'],'ppr': ['rpr'],'eln':['egn'],'opp':['opk'],'rip':['jip'],'zge': ['fge'],'jfb':['jft'],'jcu':['jcq'],'ngc':['ngq'],'vrw': ['vcw'],'uml':['fml'],'lez':['lmz'],'nhy':['uhy'],'nuz': ['nuj'],'wjj':['wjg'],'bqa':['uqa'],'kil':['kix'],'ymj': ['yxj'],'bqg':['bhg'],'lop':['lok'],'fev':['fei'],'fjp': ['fje'],'wzw':['wiw'],'ayp':['anp'],'xvf':['yvf'],'ptw': ['pte'], 'cts': ['ats']})

© www.soinside.com 2019 - 2024. All rights reserved.