File python-xapian-haystack-term-too-long.patch of Package python-xapian-haystack

From d1346dc53f35b0960badf6435cff468c648de694 Mon Sep 17 00:00:00 2001
From: Martin Owens <[email protected]>
Date: Tue, 18 Dec 2018 23:15:49 -0500
Subject: [PATCH] Add Xapian Omega solution to haystack backend to fix long
 term issues

---
 xapian_backend.py | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff -ruN a/xapian_backend.py b/xapian_backend.py
--- a/xapian_backend.py	2023-03-19 12:30:18.000000000 +0100
+++ b/xapian_backend.py	2023-12-15 11:00:40.206114151 +0100
@@ -22,6 +22,10 @@
 NGRAM_MIN_LENGTH = getattr(settings, 'XAPIAN_NGRAM_MIN_LENGTH', 2)
 NGRAM_MAX_LENGTH = getattr(settings, 'XAPIAN_NGRAM_MAX_LENGTH', 15)
 
+LONG_TERM = re.compile(br'[^\s]{239,}')  # bytes pattern: runs of 239+ non-whitespace bytes
+LONG_TERM_METHOD = getattr(settings, 'XAPIAN_LONG_TERM_METHOD', 'truncate')  # 'truncate', 'hash' or falsy
+LONG_TERM_LENGTH = getattr(settings, 'XAPIAN_LONG_TERM_LENGTH', 240)  # byte length kept by 'truncate'
+
 try:
     import xapian
 except ImportError:
@@ -1630,7 +1634,33 @@
     Converts a Python type to a
     Xapian term that can be indexed.
     """
-    return str(term).lower()
+    value = str(term).lower()
+    if LONG_TERM_METHOD:
+        value = _ensure_term_length(value)
+    return value
+ 
+def _ensure_term_length(text):
+    """
+    Return ``text`` with every run matched by LONG_TERM shortened: cut to
+    LONG_TERM_LENGTH bytes ('truncate') or replaced by its SHA-224 hex
+    digest ('hash'). Other LONG_TERM_METHOD values leave runs unchanged.
+    """
+    # Work on UTF-8 bytes: xapian's term limit is counted in bytes, not chars.
+    text = text.encode('utf8')
+
+    # Go right-to-left so earlier match offsets stay valid after each splice.
+    for match in reversed(list(LONG_TERM.finditer(text))):
+        hole = match.group()
+        if LONG_TERM_METHOD == 'truncate':
+            hole = hole[:LONG_TERM_LENGTH]
+        elif LONG_TERM_METHOD == 'hash':
+            from hashlib import sha224
+            # hole is bytes already; keep the digest as bytes for splicing.
+            hole = sha224(hole).hexdigest().encode('ascii')
+        text = text[:match.start()] + hole + text[match.end():]
+
+    # Truncation may split a multi-byte character; drop the broken tail.
+    return text.decode('utf8', 'ignore')
 
 
 def _from_xapian_value(value, field_type):
openSUSE Build Service is sponsored by
OSZAR »