File python-xapian-haystack-term-too-long.patch of Package python-xapian-haystack
From d1346dc53f35b0960badf6435cff468c648de694 Mon Sep 17 00:00:00 2001
From: Martin Owens <[email protected]>
Date: Tue, 18 Dec 2018 23:15:49 -0500
Subject: [PATCH] Add Xapian Omega solution to haystack backend to fix long
term issues
---
xapian_backend.py | 31 ++++++++++++++++++++++++++++++-
1 file changed, 30 insertions(+), 1 deletion(-)
diff -ruN a/xapian_backend.py b/xapian_backend.py
--- a/xapian_backend.py 2023-03-19 12:30:18.000000000 +0100
+++ b/xapian_backend.py 2023-12-15 11:00:40.206114151 +0100
@@ -22,6 +22,10 @@
NGRAM_MIN_LENGTH = getattr(settings, 'XAPIAN_NGRAM_MIN_LENGTH', 2)
NGRAM_MAX_LENGTH = getattr(settings, 'XAPIAN_NGRAM_MAX_LENGTH', 15)
+LONG_TERM = re.compile(b'[^\s]{239,}')
+LONG_TERM_METHOD = getattr(settings, 'XAPIAN_LONG_TERM_METHOD', 'truncate')
+LONG_TERM_LENGTH = getattr(settings, 'XAPIAN_LONG_TERM_LENGTH', 240)
+
try:
import xapian
except ImportError:
@@ -1630,7 +1634,33 @@
Converts a Python type to a
Xapian term that can be indexed.
"""
- return str(term).lower()
+ value = str(term).lower()
+ if LONG_TERM_METHOD:
+ value = _ensure_term_length(value)
+ return value
+
+def _ensure_term_length(text):
+ """
+ Ensures that terms are not too long, this helps protect against long urls
+ and CJK terms which are not tokenised by Xapian (and so are unsupported)
+ """
+ # Text must operate on bytes, not unicode, because xapian's term limit is
+ # a byte restriction length, not a char limit length.
+ text = text.encode('utf8')
+
+ for match in reversed(list(LONG_TERM.finditer(text))):
+ hole = text[match.start():match.end()]
+ # There are two options available in xapian's omega project. We re-create
+ # these two options here using python code.
+ if LONG_TERM_METHOD == 'truncate':
+ hole = hole[:LONG_TERM_LENGTH]
+ elif LONG_TERM_METHOD == 'hash':
+ from hashlib import sha224
+ hole = sha224(hole.encode('utf8')).hexdigest()
+ text = text[:match.start()] + hole + text[match.end():]
+
+ # We ignore any errors because truncate may have chopped a unicode in half.
+ return text.decode('utf8', 'ignore')
def _from_xapian_value(value, field_type):