使用apriori的单词序列

问题描述 投票:0回答:1

我已经实现了 Apriori 来查找重复出现的字母序列,但我真正想做的是查找重复出现的单词(以及单词序列)。目前的输出给出的是所有字母序列的计数,而我想要的是单词的计数:

输入数据 = ["i am donald trump", "i am donald duck"]

输出 -> {'d': 3, 'd': 3, 'on': 3, 'ld': 3, 'am': 3, 'do': 3, 'al': 3, 'na', ...}

-> 我想要的输出 -> i: 2

am: 2

trump: 2

donald: 3

i am: 2

i am donald: 2

donald trump: 1

i am donald: 2

import re
import unittest
from collections import defaultdict
import itertools

class Apriori(dict):
    """Mine frequent contiguous substrings from a list of strings.

    The instance is itself a ``dict`` that maps every frequent pattern (a
    substring of length two or more) to the number of input sequences that
    contain it.  Single characters are only used as seeds for building
    longer candidates and are never stored in the result.
    """

    def __init__(self, listOfSequences, support):
        """Run the mining pass and populate the dictionary.

        Args:
            listOfSequences (list): A list of strings, each letter
                representing a specific event.
            support (int): The minimum percentage of sequences a pattern
                must match.  The comparison is strict: a pattern is kept
                only when the number of sequences containing it is greater
                than ``support * len(listOfSequences) / 100``.
        """
        super(Apriori, self).__init__()
        self.data = listOfSequences
        self.thres = (support * len(self.data)) / 100.0
        self.primitives = self.getPrimitives()
        self.apriori()
        # The raw input is only needed while mining; drop the reference.
        del self.data

    def apriori(self):
        """Grow candidates one character at a time until none survive.

        Each round counts the current candidates, stores the ones above the
        threshold, and extends only those survivors.  Candidate length grows
        by one per round, so the loop ends once no extension exists.
        """
        candidates = self.getNewCandidates(self.primitives)
        while candidates:
            res = self.getPatternsCount(candidates)
            self.update(res)
            candidates = self.getNewCandidates(res.keys())

    def getPrimitives(self):
        """Return the set of distinct single events (characters) in the data."""
        primitives = set()
        for seq in self.data:
            primitives.update(seq)
        return primitives

    def getNewCandidates(self, candidates):
        """Extend every candidate by the character that follows it in the data.

        Args:
            candidates: Iterable of strings to extend.

        Returns:
            set: Every substring of the data made of a candidate followed by
            one more character (any character except a newline).
        """
        # re.escape keeps characters such as "." or "(" literal.  The
        # lookahead makes the match zero-width, so overlapping occurrences
        # (e.g. "aab" inside "aaab") are all reported instead of being
        # skipped by findall's non-overlapping scan.
        extenders = [
            re.compile("(?=(" + re.escape(can) + ".))") for can in candidates
        ]
        newCandidates = set()
        for seq in self.data:
            for extender in extenders:
                newCandidates.update(extender.findall(seq))
        return newCandidates

    def getPatternsCount(self, candidates):
        """Count, per candidate, how many sequences contain it.

        Args:
            candidates: Iterable of candidate substrings.

        Returns:
            dict: Candidate -> number of sequences containing it, limited to
            candidates whose count is strictly greater than ``self.thres``.
        """
        patternsCount = defaultdict(int)
        for seq in self.data:
            for can in candidates:
                if can in seq:
                    patternsCount[can] += 1
        return {k: v for k, v in patternsCount.items() if v > self.thres}


if __name__ == '__main__':
    pass

运行:

import csv
from ne import *

# Two sample sentences to mine for repeated patterns.
data = [
    "i am donald trump",
    "i am donald duck",
]

# 15 is the support percentage handed to Apriori.
patterns = Apriori(data, 15)

print(patterns)
python apriori
1个回答
0
投票

只针对您的具体问题(暂且忽略您所贴代码中的其他一些问题):只需按空格拆分每个输入字符串,并统计拆分得到的各个单词的出现次数,即可实现您的目标。

例如,使用Counter

from collections import Counter

# Sample input: each string is one whitespace-separated sequence of words.
data = ["i am donald trump", "i am donald duck"]

# Tally how often every word occurs across all sequences.
c = Counter()
for seq in data:
    # split() with no argument collapses runs of whitespace, so repeated or
    # leading/trailing spaces never produce empty-string "words".
    c.update(seq.split())

# print() with parentheses runs on both Python 2 and Python 3.
print(c)  # Counter({'i': 2, 'am': 2, 'donald': 2, 'trump': 1, 'duck': 1})

附:您关心的可能不是 most_common 的结果,而是出现次数达到某个阈值的那些单词,例如:

# Keep only the (word, count) pairs whose count reaches the threshold of 2.
# items() and print() work on both Python 2 and Python 3; the result is a
# set, so its display order may vary.
print({(k, v) for k, v in c.items() if v >= 2})  # {('am', 2), ('donald', 2), ('i', 2)}
© www.soinside.com 2019 - 2024. All rights reserved.