Lucene.net搜索:返回具有唯一字段值的结果

问题描述 投票:0回答:1

我正在编写一个基本的 Lucene.Net 应用程序来索引本质上是论坛帖子的内容。为了简化,每个

Post
文档都有一个
URL
和一些
Content
。对于每个给定的线程,我将每个
Post
索引为单独的文档(将整个线程索引为单个文档在搜索时会返回太多误报)。

我遇到的问题是处理结果集中具有相同

Post
的多个
URL
文档。当我搜索并返回 10 个结果时,我希望每个结果引用不同的
URL

目前,我有以下内容:

// Open the on-disk index and build a query parser whose default field is "body".
var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
var directory = FSDirectory.Open(indexLocation);
var searcher = new IndexSearcher(directory);
parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "body", analyzer);

// Parse the query text and request the 10 top hits, with no filter applied.
var query = parser.Parse(queryString);
var topDocs = searcher.Search(query, null, 10);

但是,在返回的 10 个结果中,可能只有 7 个是唯一的

URLs
。我考虑过先丢弃这些重复项再重新搜索,即返回更大的结果集并跳过前 10 个(类似于分页),直到凑够 10 个唯一的
URLs
,但这引发了一些问题,例如我应该何时停止,因为没有更多结果?等等

感觉应该有一种在

TopDocs topDocs = searcher.Search()
点进行过滤的方法,返回10个具有唯一
URLs
的结果。我找不到任何与此相关的内容(也许我没有使用正确的术语),但我确信很多其他应用程序之前一定已经解决了这个问题……是否已经有现成的类似功能,或者有人能就如何实现给一些指点吗?

c# lucene.net
1个回答
0
投票

我曾为 Lucene 2.9.x 开发过额外的过滤器等功能;现在我发现,若要用于 4.8 版本,它需要完全重写。

因此,如果您使用的是 2.9.x 版本,则有如下解决方案:实现您自己的命中收集器(hit collector),在其 Collect 方法中还应检查文档是否位于唯一文档集(位数组)中。该位数组必须事先构造并缓存(源码):

/// <summary>
/// Helper for a custom hit collector: answers, per document, whether the hit should be
/// kept so that every value of the "distinct by" field is reported at most once.
/// On construction it walks all terms of that field and records, for every document,
/// which term it carries and whether that term is shared with other documents.
/// </summary>
public class DistinctValuesFilter
{
    #region ctor
    /// <summary>
    /// Builds the duplicate tables for <paramref name="searchReader"/>.
    /// </summary>
    /// <param name="searchReader">Index reader whose terms are scanned for duplicates.</param>
    /// <param name="request">Search request; its <c>DistinctBy</c> names the field to de-duplicate on.</param>
    /// <param name="docLength">Size (in bits) of the "already processed" bit set.
    /// NOTE(review): <see cref="IsDistinct"/> sets bits up to the reader's MaxDoc, so this
    /// presumably has to be at least MaxDoc — confirm against the caller.</param>
    /// <param name="anlzr">Not used by this constructor.</param>
    public DistinctValuesFilter(IndexReader searchReader, ISearchRequest request, int docLength, Analyzer anlzr)
    {
        this.distinctBy = StringHelper.Intern(request.DistinctBy);
        this.processedMask = new OpenBitSetDISI(docLength);
        FindDuplicateTermsDirectly(searchReader);
        iireader = searchReader;

        // Created exactly once so that the Lazy really memoises the field-cache lookup.
        // (Previously this was an expression-bodied property that built a brand-new
        // Lazy on every access, so nothing was ever cached by it.)
        this.ivalue = new Lazy<string[]>(() => FieldCache_Fields.DEFAULT.GetStrings(iireader, distinctBy));
    }

    /// <summary>
    /// Initializes the set of duplicate documents and the per-document array of term
    /// ordinals that tells where duplicates are located.
    /// The code has partially been taken from Lucene: Lucene.Net.Search.FieldCacheImpl.StringIndexCache,
    /// protected internal override object CreateValue(IndexReader reader, Entry entryKey).
    /// </summary>
    /// <param name="ireader">Index reader to process duplicates for.</param>
    private void FindDuplicateTermsDirectly(IndexReader ireader)
    {
        int maxDoc = ireader.MaxDoc();
        duplicates = new HashSet<int>(maxDoc);
        duplicatesLocations = new int[maxDoc];

        var termEnum = ireader.Terms(new Term(this.distinctBy));
        var termDocs = ireader.TermDocs();
        int termOrdinal = 0;

        try
        {
            do
            {
                var term = termEnum.Term();

                // Stop once the enumeration leaves the field of interest (or is exhausted).
                if (term == null || term.Field() != this.distinctBy || termOrdinal >= maxDoc)
                {
                    break;
                }

                termDocs.Seek(termEnum);

                int docCount = 0;
                int previousDoc = -1;
                while (termDocs.Next())
                {
                    int doc = termDocs.Doc();

                    // Stored as ordinal + 1: 0 is reserved to mean "document has no value at all".
                    duplicatesLocations[doc] = termOrdinal + 1;

                    // From the second document of this term onwards, the preceding one is a duplicate.
                    if (docCount > 0)
                    {
                        duplicates.Add(previousDoc);
                    }

                    previousDoc = doc;
                    docCount++;
                }

                // The last document of a shared term is a duplicate as well.
                if (docCount > 1)
                {
                    duplicates.Add(previousDoc);
                }

                termOrdinal++;
            }
            while (termEnum.Next());
        }
        finally
        {
            termDocs.Close();
            termEnum.Close();
        }
    }

    protected IndexReader iireader;
    protected string distinctBy;

    // Ids of all documents whose field value is shared with at least one other document.
    private HashSet<int> duplicates;
    // Per document: (term ordinal + 1) of its field value, or 0 when it has no value.
    private int[] duplicatesLocations;
    // Documents that must no longer be reported because their value was already emitted.
    private OpenBitSetDISI processedMask;

    #endregion

    // Field values per document, fetched from the field cache on first use only.
    private readonly Lazy<string[]> ivalue;

    /// <summary>
    /// Tells whether the document at <paramref name="docIndex"/> should be reported.
    /// The first document seen for a shared value is accepted, and every document carrying
    /// the same value is then marked as processed so later calls reject it.
    /// </summary>
    /// <param name="docIndex">Document id, used as an index into the tables built at construction.</param>
    /// <returns>
    /// <c>false</c> when the document was already marked as processed or has no value for the
    /// field; <c>true</c> otherwise.
    /// </returns>
    /// <exception cref="NotSupportedException">
    /// Two documents recorded under the same term ordinal have different cached field values.
    /// </exception>
    public bool IsDistinct(int docIndex)
    {
        if (this.processedMask.FastGet(docIndex))
        {
            return false;
        }

        // The value does not exist at all for this document.
        if (duplicatesLocations[docIndex] == 0)
        {
            return false;
        }

        if (!duplicates.Contains(docIndex))
        {
            return true;
        }

        int location = duplicatesLocations[docIndex];
        string[] values = ivalue.Value;
        string seenValue = string.Empty;

        // Mark every document sharing this term as processed, checking on the way that
        // the field cache agrees they all hold the same value.
        for (int i = 0; i < duplicatesLocations.Length; i++)
        {
            if (duplicatesLocations[i] != location)
            {
                continue;
            }

            this.processedMask.FastSet(i);
            if (!string.IsNullOrEmpty(seenValue) && seenValue != values[i])
            {
                throw new NotSupportedException($"values are not same ({i}): [{seenValue}] != [{values[i]}]");
            }

            seenValue = values[i];
        }

        return true;
    }
}

© www.soinside.com 2019 - 2024. All rights reserved.