--- /tmp/python-whoosh-2.7.4+git6-g9134ad92-5yotb8t9p/debian/python-whoosh-doc_2.7.4+git6-g9134ad92-5_all.deb
+++ python-whoosh-doc_2.7.4+git6-g9134ad92-5_all.deb
├── file list
│ @@ -1,3 +1,3 @@
│  -rw-r--r--   0        0        0        4 2020-01-13 18:10:26.000000 debian-binary
│  -rw-r--r--   0        0        0     4332 2020-01-13 18:10:26.000000 control.tar.xz
│ --rw-r--r--   0        0        0   238416 2020-01-13 18:10:26.000000 data.tar.xz
│ +-rw-r--r--   0        0        0   238384 2020-01-13 18:10:26.000000 data.tar.xz
├── control.tar.xz
│ ├── control.tar
│ │ ├── ./md5sums
│ │ │ ├── ./md5sums
│ │ │ │┄ Files differ
├── data.tar.xz
│ ├── data.tar
│ │ ├── ./usr/share/doc/python-whoosh-doc/html/api/analysis.html
│ │ │ @@ -140,15 +140,15 @@
│ │ │ -whoosh.analysis.StandardAnalyzer(expression=re.compile('\\w+(\\.?\\w+)*'), stoplist=frozenset({'as', 'is', 'it', 'your', 'may', 'with', 'if', 'an', 'the', 'tbd', 'at', 'you', 'for', 'in', 'have', 'we', 'be', 'can', 'yet', 'a', 'that', 'of', 'from', 'to', 'by', 'when', 'this', 'are', 'us', 'not', 'will', 'or', 'and', 'on'}), minsize=2, maxsize=None, gaps=False)
│ │ │ +whoosh.analysis.StandardAnalyzer(expression=re.compile('\\w+(\\.?\\w+)*'), stoplist=frozenset({'an', 'it', 'when', 'if', 'or', 'not', 'this', 'for', 'we', 'us', 'to', 'on', 'at', 'and', 'yet', 'with', 'can', 'of', 'may', 'have', 'is', 'the', 'be', 'tbd', 'you', 'your', 'from', 'that', 'as', 'by', 'are', 'will', 'in', 'a'}), minsize=2, maxsize=None, gaps=False)
│ │ │  Composes a RegexTokenizer with a LowercaseFilter and optional StopFilter.
│ │ │  >>> ana = StandardAnalyzer()
│ │ │  >>> [token.text for token in ana("Testing is testing and testing")]
│ │ │  ["testing", "testing", "testing"]
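Every hunk in this file follows the same pattern as the one above: the '-' and '+' signatures list the same 34 default stop words, differing only in frozenset iteration order. A minimal check of that claim for the StandardAnalyzer hunk, with the word lists copied verbatim from the two signatures:

    # Both stoplists contain the same 34 words; only the order differs.
    old = ("as is it your may with if an the tbd at you for in have we be can "
           "yet a that of from to by when this are us not will or and on").split()
    new = ("an it when if or not this for we us to on at and yet with can of "
           "may have is the be tbd you your from that as by are will in a").split()
    assert frozenset(old) == frozenset(new)  # equal as sets
    assert old != new                        # listed in a different order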
│ │ │ @@ -169,15 +169,15 @@
│ │ │ -whoosh.analysis.StemmingAnalyzer(expression=re.compile('\\w+(\\.?\\w+)*'), stoplist=frozenset({'as', 'is', 'it', 'your', 'may', 'with', 'if', 'an', 'the', 'tbd', 'at', 'you', 'for', 'in', 'have', 'we', 'be', 'can', 'yet', 'a', 'that', 'of', 'from', 'to', 'by', 'when', 'this', 'are', 'us', 'not', 'will', 'or', 'and', 'on'}), minsize=2, maxsize=None, gaps=False, stemfn=<function stem>, ignore=None, cachesize=50000)
│ │ │ +whoosh.analysis.StemmingAnalyzer(expression=re.compile('\\w+(\\.?\\w+)*'), stoplist=frozenset({'an', 'it', 'when', 'if', 'or', 'not', 'this', 'for', 'we', 'us', 'to', 'on', 'at', 'and', 'yet', 'with', 'can', 'of', 'may', 'have', 'is', 'the', 'be', 'tbd', 'you', 'your', 'from', 'that', 'as', 'by', 'are', 'will', 'in', 'a'}), minsize=2, maxsize=None, gaps=False, stemfn=<function stem>, ignore=None, cachesize=50000)
│ │ │  Composes a RegexTokenizer with a lower case filter, an optional stop filter, and a stemming filter.
│ │ │  >>> ana = StemmingAnalyzer()
│ │ │  >>> [token.text for token in ana("Testing is testing and testing")]
│ │ │  ["test", "test", "test"]
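The ordering instability comes from Python's per-process string hash randomization: the documentation generator renders each default argument with its repr(), and a frozenset's repr follows hash order, which can change from build to build. A hypothetical demonstration (the five-word set is an arbitrary sample, not Whoosh code):

    # Show that frozenset iteration order, and hence its repr in generated
    # docs, depends on the hash seed of the interpreter that renders it.
    import os, subprocess, sys

    snippet = "print(list(frozenset({'an', 'it', 'when', 'if', 'or'})))"
    for seed in ("1", "2"):
        env = {**os.environ, "PYTHONHASHSEED": seed}
        out = subprocess.run([sys.executable, "-c", snippet],
                             env=env, capture_output=True, text=True)
        print(f"PYTHONHASHSEED={seed}: {out.stdout.strip()}")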
│ │ │ @@ -202,15 +202,15 @@
│ │ │ -whoosh.analysis.FancyAnalyzer(expression='\\s+', stoplist=frozenset({'as', 'is', 'it', 'your', 'may', 'with', 'if', 'an', 'the', 'tbd', 'at', 'you', 'for', 'in', 'have', 'we', 'be', 'can', 'yet', 'a', 'that', 'of', 'from', 'to', 'by', 'when', 'this', 'are', 'us', 'not', 'will', 'or', 'and', 'on'}), minsize=2, maxsize=None, gaps=True, splitwords=True, splitnums=True, mergewords=False, mergenums=False)
│ │ │ +whoosh.analysis.FancyAnalyzer(expression='\\s+', stoplist=frozenset({'an', 'it', 'when', 'if', 'or', 'not', 'this', 'for', 'we', 'us', 'to', 'on', 'at', 'and', 'yet', 'with', 'can', 'of', 'may', 'have', 'is', 'the', 'be', 'tbd', 'you', 'your', 'from', 'that', 'as', 'by', 'are', 'will', 'in', 'a'}), minsize=2, maxsize=None, gaps=True, splitwords=True, splitnums=True, mergewords=False, mergenums=False)
│ │ │  Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and StopFilter.
│ │ │  >>> ana = FancyAnalyzer()
│ │ │  >>> [token.text for token in ana("Should I call getInt or get_real?")]
│ │ │  ["should", "call", "getInt", "get", "int", "get_real", "get", "real"]
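The FancyAnalyzer description above names the stages being composed; a rough hand-built equivalent under the default settings shown in the signature (a sketch, not the library's exact construction):

    from whoosh.analysis import (IntraWordFilter, LowercaseFilter,
                                 RegexTokenizer, StopFilter)

    # Tokenize on whitespace (gaps=True), split intra-word case/number runs,
    # lowercase, then drop stop words -- mirroring the defaults above.
    ana = (RegexTokenizer(expression=r"\s+", gaps=True)
           | IntraWordFilter(splitwords=True, splitnums=True,
                             mergewords=False, mergenums=False)
           | LowercaseFilter()
           | StopFilter())
    print([t.text for t in ana("Should I call getInt or get_real?")])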
│ │ │ @@ -511,15 +511,15 @@
│ │ │  class whoosh.analysis.StripFilter
│ │ │  Calls unicode.strip() on the token text.
│ │ │ -class whoosh.analysis.StopFilter(stoplist=frozenset({'as', 'is', 'it', 'your', 'may', 'with', 'if', 'an', 'the', 'tbd', 'at', 'you', 'for', 'in', 'have', 'we', 'be', 'can', 'yet', 'a', 'that', 'of', 'from', 'to', 'by', 'when', 'this', 'are', 'us', 'not', 'will', 'or', 'and', 'on'}), minsize=2, maxsize=None, renumber=True, lang=None)
│ │ │ +class whoosh.analysis.StopFilter(stoplist=frozenset({'an', 'it', 'when', 'if', 'or', 'not', 'this', 'for', 'we', 'us', 'to', 'on', 'at', 'and', 'yet', 'with', 'can', 'of', 'may', 'have', 'is', 'the', 'be', 'tbd', 'you', 'your', 'from', 'that', 'as', 'by', 'are', 'will', 'in', 'a'}), minsize=2, maxsize=None, renumber=True, lang=None)
│ │ │  Marks “stop” words (words too common to index) in the stream (and by default removes them).
│ │ │  Make sure you precede this filter with a LowercaseFilter.
│ │ │  >>> stopper = RegexTokenizer() | StopFilter()
│ │ │  >>> [token.text for token in stopper(u"this is a test")]
│ │ │  ["test"]
│ │ │  >>> es_stopper = RegexTokenizer() | StopFilter(lang="es")
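The doctest in this last hunk is cut off at the lang="es" line; a self-contained variant, with a hypothetical custom stoplist to show the stoplist and minsize parameters:

    from whoosh.analysis import RegexTokenizer, StopFilter

    # The default stoplist drops "this", "is", and "a"; only "test" survives.
    stopper = RegexTokenizer() | StopFilter()
    print([t.text for t in stopper(u"this is a test")])  # ['test']

    # Hypothetical custom stoplist; minsize=1 keeps the one-letter token "a".
    custom = RegexTokenizer() | StopFilter(stoplist=["spam", "eggs"], minsize=1)
    print([t.text for t in custom(u"spam and eggs and a test")])  # ['and', 'and', 'a', 'test']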