# ====================================================================
# Copyright (c) 2004-2005 Open Source Applications Foundation.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions: 
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
# ====================================================================
#

from unittest import TestCase, main
from PyLucene import *


class FuzzyQueryTestCase(TestCase):
    """
    Unit tests ported from Java Lucene

    Each test indexes a handful of single-term documents into an
    in-memory directory and checks which of them a FuzzyQuery built with
    its default settings matches, and in which order.
    """

    def setUp(self):
        # Kept on the instance so that tearDown can release them even when
        # an assertion fails half-way through a test.
        self.directory = None
        self.searcher = None

    def tearDown(self):
        # Close the searcher first, and make sure the directory is closed
        # even if closing the searcher raises.
        try:
            if self.searcher is not None:
                self.searcher.close()
        finally:
            if self.directory is not None:
                self.directory.close()

    def _addDoc(self, text, writer):
        """Add one document whose stored, tokenized "field" holds text."""
        doc = Document()
        doc.add(Field("field", text,
                      Field.Store.YES, Field.Index.TOKENIZED))
        writer.addDocument(doc)

    def _openSearcher(self, texts):
        """
        Index every string in texts as its own document (in the given
        order) into a fresh RAMDirectory and open a searcher on it.

        The directory and searcher are remembered on self and are closed
        by tearDown.  Returns the searcher.
        """
        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True)
        try:
            for text in texts:
                self._addDoc(text, writer)
            writer.optimize()
        finally:
            # Release the writer even if indexing failed.
            writer.close()

        self.searcher = IndexSearcher(self.directory)
        return self.searcher

    def _search(self, text, field="field"):
        """Run a default FuzzyQuery for text on field; return the hits."""
        return self.searcher.search(FuzzyQuery(Term(field, text)))

    def _assertHitCount(self, text, count, field="field"):
        """Assert that a fuzzy search for text yields exactly count hits."""
        hits = self._search(text, field)
        self.assertEqual(count, hits.length())

    def _assertHitFields(self, text, expected):
        """
        Assert that a fuzzy search for text on "field" yields exactly
        len(expected) hits whose stored "field" values equal expected,
        in the same order.
        """
        hits = self._search(text)
        self.assertEqual(len(expected), hits.length())
        for i, value in enumerate(expected):
            self.assertEqual(hits.doc(i).get("field"), value)

    def testDefaultFuzziness(self):

        self._openSearcher(["aaaaa", "aaaab", "aaabb", "aabbb",
                            "abbbb", "bbbbb", "ddddd"])

        self._assertHitCount("aaaaa", 3)

        # not similar enough:
        self._assertHitCount("xxxxx", 0)
        # edit distance to "aaaaa" = 3
        self._assertHitCount("aaccc", 0)

        # query identical to a word in the index;
        # default allows for up to two edits:
        self._assertHitFields("aaaaa", ["aaaaa", "aaaab", "aaabb"])

        # query similar to a word in the index:
        self._assertHitFields("aaaac", ["aaaaa", "aaaab", "aaabb"])

        self._assertHitFields("ddddX", ["ddddd"])

        # different field = no match:
        self._assertHitCount("ddddX", 0, field="anotherfield")

    def testDefaultFuzzinessLong(self):

        self._openSearcher(["aaaaaaa", "segment"])

        # not similar enough:
        self._assertHitCount("xxxxx", 0)
        # edit distance to "aaaaaaa" = 3, this matches because
        # the string is longer than
        # in testDefaultFuzziness so a bigger difference is allowed:
        self._assertHitFields("aaaaccc", ["aaaaaaa"])

        # no match, more than half of the characters are wrong:
        self._assertHitCount("aaacccc", 0)

        # "student" and "stellent" are indeed similar to "segment" by default:
        self._assertHitCount("student", 1)
        self._assertHitCount("stellent", 1)

if __name__ == "__main__":
    import sys
    if '-loop' in sys.argv:
        # "-loop" is a private flag: strip it so main() does not see it,
        # then run the suite over and over until the process is stopped.
        sys.argv.remove('-loop')
        while True:
            try:
                main()
            except KeyboardInterrupt:
                # Let Ctrl-C break out of the otherwise endless loop.
                raise
            except (Exception, SystemExit):
                # unittest's main() normally ends by raising SystemExit;
                # swallow it (and ordinary errors) so the loop goes on.
                pass
    else:
        main()
