摘要:Lucene.Net 實作建議字搜尋
Lucene.Net 有一 Contrib Library , SpellChecker
它提供搜尋建議的功能 , 例如下面的範例 ,
例如我們在 Google 輸入 apach , 可是實際正確的字串應該要是 apache
目前顯示的是以下字詞的搜尋結果: apache
您可以改回搜尋: apach
若要取得 Library 可到 網址 , 其可透過 Visual Studio 2010 的 Package Manager Console 下載
Step 1 : Build RAMDirectory
RAMDirectory dir = new RAMDirectory();
Step 2 : Build Index :
我們在 Line 4 and Line 9 建立了 text 欄位 , 並且設定 value 為 "this is a writen writing with some words"
IndexWriter iw = new IndexWriter(dir, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30), IndexWriter.MaxFieldLength.UNLIMITED);
Document d = new Document();
Field textField = new Field("text", "", Field.Store.YES, Field.Index.ANALYZED);
Field idField = new Field("id", "", Field.Store.YES, Field.Index.NOT_ANALYZED);
d.Add(textField);
d.Add(idField);
textField.SetValue("this is a writen writing with a some words");
idField.SetValue("42");
iw.AddDocument(d);
iw.Commit();
Step 3 : Search
我們在 Line 2 宣告了 SpellChecker , 而 SpellChecker 會搜尋自己獨立出來的 Index 儲存體 ,
在這裡我們直接讓他儲存在記憶體中 , 而在 Line 4 ,我們想要找出跟 write 相似的字串 , 並且只要回傳 5 個字串就好
IndexReader reader = iw.GetReader();
SpellChecker.Net.Search.Spell.SpellChecker speller = new SpellChecker.Net.Search.Spell.SpellChecker(new RAMDirectory());
speller.IndexDictionary(new LuceneDictionary(reader, "text"));
string[] suggestions = speller.SuggestSimilar("write", 5);
IndexSearcher searcher = new IndexSearcher(reader);
foreach (string suggestion in suggestions)
{
Response.Write(suggestion + "
");
}
reader.Dispose();
iw.Dispose();
Result :
writen
writing
接下來我們就來探討它是怎麼判斷相似的字串 , 使用 ILSpy 來查看其原始碼 :
首先來查看 SuggestSimilar 方法 , 特別是在 Line 46 中使用了一個 GetDistance 方法
public virtual string[] SuggestSimilar(string word, int numSug, IndexReader ir, string field, bool morePopular)
{
IndexSearcher indexSearcher = this.ObtainSearcher();
string[] result;
try
{
float min = this.minScore;
int lengthWord = word.Length;
int freq = (ir != null && field != null) ? ir.DocFreq(new Term(field, word)) : 0;
int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;
if (!morePopular && freq > 0)
{
result = new string[]
{
word
};
}
else
{
BooleanQuery query = new BooleanQuery();
HashSet alreadySeen = new HashSet();
for (int ng = this.GetMin(lengthWord); ng <= this.GetMax(lengthWord); ng++)
{
string key = "gram" + ng;
string[] grams = SpellChecker.FormGrams(word, ng);
if (grams.Length != 0)
{
SpellChecker.Add(query, "start" + ng, grams[0], 2f);
SpellChecker.Add(query, "end" + ng, grams[grams.Length - 1], 1f);
for (int i = 0; i < grams.Length; i++)
{
SpellChecker.Add(query, key, grams[i]);
}
}
}
int maxHits = 10 * numSug;
ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).ScoreDocs;
SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
int stop = Math.Min(hits.Length, maxHits);
SuggestWord sugWord = new SuggestWord();
for (int j = 0; j < stop; j++)
{
sugWord.termString = indexSearcher.Doc(hits[j].Doc).Get("word");
if (!sugWord.termString.Equals(word))
{
sugWord.score = this.sd.GetDistance(word, sugWord.termString);
if (sugWord.score >= min)
{
if (ir != null && field != null)
{
sugWord.freq = ir.DocFreq(new Term(field, sugWord.termString));
if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1)
{
goto IL_21E;
}
}
if (alreadySeen.Add(sugWord.termString))
{
sugQueue.InsertWithOverflow(sugWord);
if (sugQueue.Size() == numSug)
{
min = sugQueue.Top().score;
}
sugWord = new SuggestWord();
}
}
}
IL_21E:;
}
string[] list = new string[sugQueue.Size()];
for (int k = sugQueue.Size() - 1; k >= 0; k--)
{
list[k] = sugQueue.Pop().termString;
}
result = list;
}
}
finally
{
this.ReleaseSearcher(indexSearcher);
}
return result;
}
我們來看看 Lucene.Net API 如何描述這個方法
Returns a float between 0 and 1 based on how similar the specified strings are to one another. Returning a value of 1 means the specified strings are identical and 0 means the string are maximally different
從這描述可以知道它會判斷兩字串是否相似 , 而 GetDistance 其實是一個介面 , 它的實作被定義在
JaroWinklerDistance , NGramDistance , LevenshteinDistance 這三個 class 裡面 ,
從這三個 class 的名稱可以知道分別是三種演算法 ,
Jaro–Winkler distance , n-gram distance , Levenshtein distance
這三種演算法特別用在 短 字串中用來判斷兩字串是否相似 ,
有興趣的可以去這些 class 裡研究研究