如何得到与sklearn的TfidfVectorizer相同的结果?

问题描述 投票:0回答:1

我正在尝试从头开始构建TfidfVectorizer。我已经实现了一个与sklearn几乎相同的向量化器,但是无法得到与TfidfVectorizer相同的tf-idf分数。

这是我的代码:

def vocab(corpus):
    """Return the document frequency of every term in ``corpus``.

    Args:
        corpus: iterable of documents (strings); each document is
            tokenized by splitting on whitespace.

    Returns:
        A dict mapping each term to the number of documents that contain
        it. This is the ``n`` used by the idf formula
        ``1 + log((N + 1) / (n + 1))``, so a term repeated inside a single
        document must only be counted once for that document.
    """
    doc_freq = Counter()
    for sentence in corpus:
        # dict.fromkeys() de-duplicates the tokens of one document while
        # keeping first-appearance order, so the result is deterministic.
        for word in dict.fromkeys(sentence.split()):
            doc_freq[word] += 1
    return dict(doc_freq)


def tfidf(corpus, vocab):
    """Print the L2-normalized tf-idf weight of every term of every document.

    The weighting follows the defaults of scikit-learn's TfidfVectorizer:
    raw term count as tf, smoothed idf ``1 + ln((N + 1) / (n + 1))`` and
    L2 normalization of each document's weight vector.

    Args:
        corpus: sequence of documents (strings), tokenized on whitespace.
        vocab: mapping of term -> number of documents containing the term.
            It is used as passed in; it is a mapping, not a callable.

    Output:
        One line per (document, term) of the form ``(row, col) weight``.
        ``row`` is the document index and ``col`` is the position of the
        term among the document's distinct terms in first-appearance
        order. Terms missing from ``vocab`` are skipped, since no idf can
        be computed for them.
    """
    n_docs = len(corpus)
    for row, sentence in enumerate(corpus):
        word_freq = Counter(sentence.split())

        # Unnormalized tf * idf for each in-vocabulary term, keyed by column.
        weights = {}
        for col, (word, freq) in enumerate(word_freq.items()):
            n = vocab.get(word)
            if n is None:
                continue
            idf = 1.0 + math.log((n_docs + 1) / (n + 1))
            weights[col] = freq * idf

        # Scale the row to unit Euclidean length; guard the all-zero row.
        norm = math.sqrt(sum(w * w for w in weights.values()))
        for col, weight in weights.items():
            print((row, col), weight / norm if norm else weight)


# Build the per-term counts for the corpus, then print the tf-idf scores.
# NOTE(review): `corpus` is not defined anywhere above this point in the
# snippet; it is assumed to be a list of document strings defined earlier.
vocabs = vocab(corpus)
tfidf(corpus, vocabs)

第一行的输出是

(0,0)0.038461538461538464

(0,1)0.038461538461538464

(0,2)0.038461538461538464

(0,3)0.05810867783715349

(0,4)0.038461538461538464

而sklearn的TfidfVectorizer的输出是

(0,8)0.38408524091481483

(0,6)0.38408524091481483

(0,3)0.38408524091481483

(0,2)0.5802858236844359

(0,1)0.46979138557992045

您能告诉我哪里错了吗?谢谢。

python machine-learning scikit-learn tf-idf
1个回答
0
投票
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import numpy as np
import pandas as pd

def tfidf_vectorizer(corpus):
    """Build an unnormalized tf-idf matrix for a whitespace-tokenized corpus.

    Each entry is ``tf * idf`` where ``tf`` is the raw count of the term in
    the document and ``idf = 1 + ln((N + 1) / (n + 1))``, with ``N`` the
    number of documents and ``n`` the number of documents containing the
    term. No row normalization is applied.

    Args:
        corpus: sequence of documents (strings), tokenized by splitting on
            whitespace.

    Returns:
        A tuple ``(terms, matrix)``: ``terms`` is the sorted list of
        distinct terms and ``matrix`` is a ``(len(corpus), len(terms))``
        NumPy array whose column ``j`` corresponds to ``terms[j]``.
    """
    tokenized = [document.split() for document in corpus]
    terms = sorted({token for tokens in tokenized for token in tokens})

    # Document frequency must compare whole tokens: a substring test such
    # as `term in document` would count "is" as present in "this".
    doc_freq = Counter(
        token for tokens in tokenized for token in set(tokens)
    )

    # idf depends only on the term, so compute it once per column instead
    # of once per (document, term) pair.
    n_docs = len(corpus)
    idf = [1.0 + np.log((n_docs + 1) / (doc_freq[term] + 1)) for term in terms]
    column = {term: j for j, term in enumerate(terms)}

    matrix = np.zeros((n_docs, len(terms)))
    for i, tokens in enumerate(tokenized):
        for term, count in Counter(tokens).items():
            j = column[term]
            matrix[i, j] = count * idf[j]

    return (terms, matrix)


# Toy corpus used to compare the manual implementation with scikit-learn.
corpus = ['this is the first document',
          'this document is the second document',
          'this one is the third']

# manual calculation
vectorizer_1 = tfidf_vectorizer(corpus)

terms_1 = vectorizer_1[0]
matrix_1 = vectorizer_1[1]

# scikit-learn calculation
# norm=None turns off row normalization so the result is directly comparable
# with the raw tf * idf products of the manual matrix.
vectorizer_2 = TfidfVectorizer(norm=None).fit(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer_2, 'get_feature_names_out'):
    terms_2 = list(vectorizer_2.get_feature_names_out())
else:
    terms_2 = vectorizer_2.get_feature_names()
matrix_2 = vectorizer_2.transform(corpus).toarray()
print(pd.DataFrame(data=matrix_1, columns=terms_1))

   document     first   is       one    second  the     third  this
0  1.287682  1.693147  1.0  0.000000  0.000000  1.0  0.000000   1.0
1  2.575364  0.000000  1.0  0.000000  1.693147  1.0  0.000000   1.0
2  0.000000  0.000000  1.0  1.693147  0.000000  1.0  1.693147   1.0
# Show the scikit-learn matrix for comparison with the manual one.
print(pd.DataFrame(data=matrix_2, columns=terms_2))

   document     first   is       one    second  the     third  this
0  1.287682  1.693147  1.0  0.000000  0.000000  1.0  0.000000   1.0
1  2.575364  0.000000  1.0  0.000000  1.693147  1.0  0.000000   1.0
2  0.000000  0.000000  1.0  1.693147  0.000000  1.0  1.693147   1.0
© www.soinside.com 2019 - 2024. All rights reserved.