The tsvector data type has always forbidden lexemes to be empty.
However, array_to_tsvector() didn't get that memo, and would
allow an empty-string array element to become an empty lexeme.
This could result in dump/restore failures later, not to mention
whatever semantic issues might be behind the original prohibition.
However, other functions that take a plain text input directly as
a lexeme value do not need a similar restriction, because they only
match the string against existing tsvector entries. In particular
it'd be a bad idea to make ts_delete() reject empty strings, since
that is the most convenient way to clean up any bad data that might
have gotten into a tsvector column via this bug.
Reflecting on that, let's also remove the prohibition against NULL
array elements in tsvector_delete_arr and tsvector_setweight_by_filter.
It seems more consistent to ignore them, as an empty-string element
would be ignored.
There's a case for back-patching this, since it's clearly a bug fix.
On balance though, it doesn't seem like something to change in a
minor release.
Jean-Christophe Arnu
Discussion: https://postgr.es/m/CAHZmTm1YVndPgUVRoag2WL0w900XcoiivDDj-gTTYBsG25c65A@mail.gmail.com
tsvector
- Converts an array of lexemes to a tsvector.
- The given strings are used as-is without further processing.
+ Converts an array of text strings to a tsvector.
+ The given strings are used as lexemes as-is, without further
+ processing. Array elements must not be empty strings
+ or NULL.
array_to_tsvector('{fat,cat,rat}'::text[])
Assigns the specified
weight to elements
of the
vector that are listed in lexemes.
+ The strings in
lexemes are taken as lexemes
+ as-is, without further processing. Strings that do not match any
+ lexeme in
vector are ignored.
setweight('fat:2,4 cat:3 rat:5,6B'::tsvector, 'A', '{cat,rat}')
Removes any occurrence of the given
lexeme from the vector.
+ The
lexeme string is treated as a lexeme as-is,
+ without further processing.
ts_delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat')
Removes any occurrences of the lexemes
in lexemes from the vector.
+ The strings in
lexemes are taken as lexemes
+ as-is, without further processing. Strings that do not match any
+ lexeme in
vector are ignored.
ts_delete('fat:2,4 cat:3 rat:5A'::tsvector, ARRAY['fat','rat'])
int lex_len,
lex_pos;
+ /* Ignore null array elements, they surely don't match */
if (nulls[i])
- ereport(ERROR,
- (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
- errmsg("lexeme array may not contain nulls")));
+ continue;
lex = VARDATA(dlexemes[i]);
lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
int lex_len,
lex_pos;
+ /* Ignore null array elements, they surely don't match */
if (nulls[i])
- ereport(ERROR,
- (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
- errmsg("lexeme array may not contain nulls")));
+ continue;
lex = VARDATA(dlexemes[i]);
lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
deconstruct_array(v, TEXTOID, -1, false, TYPALIGN_INT, &dlexemes, &nulls, &nitems);
- /* Reject nulls (maybe we should just ignore them, instead?) */
+ /*
+ * Reject nulls and zero-length strings (maybe we should just ignore them,
+ * instead?)
+ */
for (i = 0; i < nitems; i++)
{
if (nulls[i])
ereport(ERROR,
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("lexeme array may not contain nulls")));
+
+ if (VARSIZE(dlexemes[i]) - VARHDRSZ == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
+ errmsg("lexeme array may not contain empty strings")));
}
/* Sort and de-dup, because this is required for a valid tsvector. */
'a':3A,4B 'b':2A 'ba':1237
(1 row)
+SELECT $$'' '1' '2'$$::tsvector; -- error, empty lexeme is not allowed
+ERROR: syntax error in tsvector: "'' '1' '2'"
+LINE 1: SELECT $$'' '1' '2'$$::tsvector;
+ ^
--Base tsquery test
SELECT '1'::tsquery;
tsquery
'base' 'hidden' 'strike'
(1 row)
-SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
-ERROR: lexeme array may not contain nulls
+SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', '', NULL]);
+ ts_delete
+--------------------------
+ 'base' 'hidden' 'strike'
+(1 row)
+
SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
unnest
---------------------------------------------
'base' 'hidden' 'rebel' 'spaceship' 'strike'
(1 row)
+-- null and empty string are disallowed, since we mustn't make an empty lexeme
SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]);
ERROR: lexeme array may not contain nulls
+SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', '']);
+ERROR: lexeme array may not contain empty strings
-- array_to_tsvector must sort and de-dup
SELECT array_to_tsvector(ARRAY['foo','bar','baz','bar']);
array_to_tsvector
'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81C,222C,567C
(1 row)
-SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', '', NULL]);
setweight
---------------------------------
'a' 'asd' 'w':5,6,12B,13A 'zxc'
(1 row)
-SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]);
-ERROR: lexeme array may not contain nulls
SELECT ts_filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
ts_filter
-------------------------------------------------------------
SELECT tsvectorin(tsvectorout($$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector));
SELECT '''w'':4A,3B,2C,1D,5 a:8';
SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
+SELECT $$'' '1' '2'$$::tsvector; -- error, empty lexeme is not allowed
--Base tsquery test
SELECT '1'::tsquery;
SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel','rebel']);
-SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
+SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', '', NULL]);
SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
SELECT unnest('base hidden rebel spaceship strike'::tsvector);
SELECT tsvector_to_array('base hidden rebel spaceship strike'::tsvector);
SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+-- null and empty string are disallowed, since we mustn't make an empty lexeme
SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]);
+SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', '']);
-- array_to_tsvector must sort and de-dup
SELECT array_to_tsvector(ARRAY['foo','bar','baz','bar']);
SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
-SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
-SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]);
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', '', NULL]);
SELECT ts_filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
SELECT ts_filter('base hidden rebel spaceship strike'::tsvector, '{a}');