用Python将字符串内核封装到scikit SVM分类器中

问题描述 投票:0回答:1

我需要使用的不是一个库,而是一个硬编码的字符串内核来进行文本分类,但我遇到了一些问题,请赐教。(参见 https://github.com/helq/python-ssk)

import numpy as np

# Kernel defined by Lodhi et al. (2002)
def ssk(s, t, n, lbda, accum=False):
    """Compute the (unnormalised) string subsequence kernel of two sequences.

    Implements the dynamic-programming recursion of the kernel defined by
    Lodhi et al. (2002).

    Args:
        s: First sequence (a string or any indexable sequence whose items
            can be compared with ``==``).
        t: Second sequence.
        n: Maximum subsequence length. Must be at least 1, because row 0 of
            the auxiliary table is always written.
        lbda: Decay factor applied to every position spanned by a match.
        accum: If True, return the sum of the kernels for subsequence
            lengths 1..n. If False, return only the kernel for length n.

    Returns:
        The kernel value as a float (a NumPy float when any match occurs).
    """
    lens, lent = len(s), len(t)

    # k_prim[i, sj, tk] is the auxiliary table of the recursion for
    # subsequences of length i, restricted to s[:sj] and t[:tk].
    k_prim = np.zeros((n, lens, lent))
    k_prim[0, :, :] = 1

    for i in range(1, n):
        for sj in range(i, lens):
            toret = 0.
            for tk in range(i, lent):
                if s[sj - 1] == t[tk - 1]:  # trick taken from the shogun implementation of SSK
                    toret = lbda * (toret + lbda * k_prim[i - 1, sj - 1, tk - 1])
                else:
                    toret *= lbda
                k_prim[i, sj, tk] = toret + lbda * k_prim[i, sj - 1, tk]

    # Without accumulation only the last level (length-n subsequences)
    # contributes to the result; with accumulation every level does.
    start = 0 if accum else n - 1
    k = 0.
    for i in range(start, n):
        for sj in range(i, lens):
            for tk in range(i, lent):
                if s[sj] == t[tk]:
                    k += lbda * lbda * k_prim[i, sj, tk]

    return k

def string_kernel(xs, ys, n, lbda):
    """Compute the normalised SSK Gram matrix between two sets of strings.

    Args:
        xs: Array of shape (a, 1) holding one sequence per row.
        ys: Array of shape (b, 1) holding one sequence per row.
        n: Maximum subsequence length passed to ``ssk``.
        lbda: Decay factor passed to ``ssk``.

    Returns:
        An (a, b) array whose entry [i, j] is
        ``ssk(xs[i], ys[j]) / sqrt(ssk(xs[i], xs[i]) * ssk(ys[j], ys[j]))``,
        with every ``ssk`` call made with ``accum=True``.

    Raises:
        ValueError: If either input is not two-dimensional with exactly one
            column.
    """
    if len(xs.shape) != 2 or len(ys.shape) != 2 or xs.shape[1] != 1 or ys.shape[1] != 1:
        # Raising a bare string is itself a TypeError in Python 3; raise a
        # real exception so the caller sees the intended message.
        raise ValueError("The shape of the features is wrong, it must be (n,1)")

    lenxs, lenys = xs.shape[0], ys.shape[0]

    # Raw (unnormalised) cross-kernel values.
    mat = np.zeros((lenxs, lenys))
    for i in range(lenxs):
        for j in range(lenys):
            mat[i, j] = ssk(xs[i, 0], ys[j, 0], n, lbda, accum=True)

    # Self-kernel of every string, used as normalisation factors.
    mat_xs = np.zeros((lenxs, 1))
    mat_ys = np.zeros((lenys, 1))

    for i in range(lenxs):
        mat_xs[i] = ssk(xs[i, 0], xs[i, 0], n, lbda, accum=True)
    for j in range(lenys):
        mat_ys[j] = ssk(ys[j, 0], ys[j, 0], n, lbda, accum=True)

    # (1, b) * (a, 1) broadcasts to the (a, b) matrix of self-kernel products.
    return np.divide(mat, np.sqrt(mat_ys.T * mat_xs))

我尝试了他提出的包装器。

def get_ssk_kernel_for_scikit(max_substring, lambda_decay):
    """Return a two-argument kernel callable bound to the given SSK settings.

    The returned function takes two collections of strings, turns each into
    a single-column array and delegates to ``string_kernel`` with
    ``max_substring`` and ``lambda_decay``.
    """
    def strker(il, ir):
        # Both inputs are assumed to be lists of strings. ``len`` may not
        # report the real number of samples for arbitrary np.arrays, so the
        # data is reshaped explicitly into (n, 1) and (m, 1) columns.
        left_column = np.array(il).reshape(len(il), 1)
        right_column = np.array(ir).reshape(len(ir), 1)
        gram = string_kernel(left_column, right_column, max_substring, lambda_decay)
        return gram

    return strker

# Decay factor handed to the string kernel.
lambda_decay = 1
# Maximum subsequence length handed to the string kernel.
max_substring = 2

# Kernel callable bound to the two settings above.
my_ssk_kernel = get_ssk_kernel_for_scikit(max_substring, lambda_decay)

然后,我用scikit来执行SVM。

# Toy data set: four words with binary labels.
xs = ["man", "woman", "women", "men"] 
ys = [0, 1, 1, 0] 

# The callable kernel is handed to SVC and fit() receives the raw strings;
# this is the call that triggers the error quoted below.
clf = svm.SVC(kernel = my_ssk_kernel)
clf.fit(xs, ys)

然后我得到了一个错误。

ValueError: could not convert string to float: 'man'

我找了好几个小时了,但我没有找到任何解决方案。关于这个问题(如何在scikit-learn中使用字符串内核?)所给的例子对我来说是行不通的。如果你能帮助我,我将非常感激!

python python-3.x machine-learning scikit-learn svm
1个回答
0
投票

我认为你可以改为把整个 Gram 矩阵传给 SVC,并把它的 kernel 参数设置为 "precomputed"(预计算)。

xs = ["man", "woman", "women", "men"]
ys = [0, 1, 1, 0]

# string_kernel needs arrays of shape (n, 1), not plain lists, and the decay
# setting defined earlier is named lambda_decay.
train_strings = np.array(xs).reshape(-1, 1)
train_kernel = string_kernel(train_strings, train_strings, max_substring, lambda_decay)
# Requires `from sklearn.svm import SVC`.
svc = SVC(kernel='precomputed')
svc.fit(train_kernel, ys)

然后,在进行任何预测之前,你必须以同样的方式计算待预测样本与训练样本之间的 Gram 矩阵。

unknowns = ["man", "women"]
# string_kernel needs (n, 1) string arrays: rows are the samples to predict,
# columns are the training samples the SVC was fitted on. The decay setting
# defined earlier is named lambda_decay.
test_kernel = string_kernel(
    np.array(unknowns).reshape(-1, 1),
    np.array(xs).reshape(-1, 1),
    max_substring,
    lambda_decay,
)
svc.predict(test_kernel)
© www.soinside.com 2019 - 2024. All rights reserved.