From: Tom Lane Date: Sat, 25 Aug 2007 06:26:57 +0000 (+0000) Subject: Some more tsearch docs work --- sync names with CVS-tip reality, some X-Git-Tag: REL9_0_0~5160 X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=52a0830c407e7743062d26cef9f4c6a27c897f08;p=pg-rex%2Fsyncrep.git Some more tsearch docs work --- sync names with CVS-tip reality, some minor rewording, some markup fixups. Lots left to do here ... --- diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 0d1ab50002..5124bd80ae 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -210,9 +210,9 @@ SELECT 'a:1 fat:2 cat:3 sat:4 on:5 a:6 mat:7 and:8 ate:9 a:10 fat:11 rat:12'::ts 'a':1,6,10 'on':5 'and':8 'ate':9 'cat':3 'fat':2,11 'mat':7 'rat':12 'sat':4 -Each lexeme position also can be labeled as 'A', -'B', 'C', 'D', -where 'D' is the default. These labels can be used to group +Each lexeme position also can be labeled as A, +B, C, D, +where D is the default. These labels can be used to group lexemes into different importance or rankings, for example to reflect document structure. Actual values can be assigned at search time and used during the calculation @@ -668,9 +668,9 @@ setweight(vector TSVECTOR, This function returns a copy of the input vector in which every location -has been labeled with either the letter 'A', -'B', or 'C', or the default label -'D' (which is the default for new vectors +has been labeled with either the letter A, +B, or C, or the default label +D (which is the default for new vectors and as such is usually not displayed). These labels are retained when vectors are concatenated, allowing words from different parts of a document to be weighted differently by ranking functions. @@ -807,13 +807,12 @@ to be made. 
-stat +ts_stat -stat(sqlquery text , weight text ) returns SETOF statinfo - +ts_stat(sqlquery text , weights text ) returns SETOF statinfo @@ -821,27 +820,27 @@ stat(sqlquery text Here statinfo is a type, defined as: -CREATE TYPE statinfo AS (word text, ndoc int4, nentry int4); +CREATE TYPE statinfo AS (word text, ndoc integer, nentry integer); -and sqlquery is a query which returns a -tsvector column's contents. stat returns -statistics about a tsvector column, i.e., the number of -documents, ndoc, and the total number of words in the -collection, nentry. It is useful for checking your -configuration and to find stop word candidates. For example, to find -the ten most frequent words: +and sqlquery is a text value containing a SQL query +which returns a single tsvector column. ts_stat +executes the query and returns statistics about the resulting +tsvector data, i.e., the number of documents, ndoc, +and the total number of words in the collection, nentry. It is +useful for checking your configuration and to find stop word candidates. For +example, to find the ten most frequent words: -SELECT * FROM stat('SELECT vector from apod') +SELECT * FROM ts_stat('SELECT vector from apod') ORDER BY ndoc DESC, nentry DESC, word LIMIT 10; -Optionally, one can specify weight to obtain +Optionally, one can specify weights to obtain statistics about words with a specific weight: -SELECT * FROM stat('SELECT vector FROM apod','a') +SELECT * FROM ts_stat('SELECT vector FROM apod','a') ORDER BY ndoc DESC, nentry DESC, word LIMIT 10; @@ -1146,9 +1145,9 @@ topic. -The rewrite() function changes the original query by +The ts_rewrite() function changes the original query by replacing part of the query with some other string of type tsquery, -as defined by the rewrite rule. Arguments to rewrite() +as defined by the rewrite rule. Arguments to ts_rewrite() can be names of columns of type tsquery. 
@@ -1161,20 +1160,20 @@ INSERT INTO aliases VALUES('a', 'c'); -rewrite - 1 +ts_rewrite -rewrite (query TSQUERY, target TSQUERY, sample TSQUERY) returns TSQUERY +ts_rewrite (query TSQUERY, target TSQUERY, sample TSQUERY) returns TSQUERY -SELECT rewrite('a & b'::tsquery, 'a'::tsquery, 'c'::tsquery); - rewrite +SELECT ts_rewrite('a & b'::tsquery, 'a'::tsquery, 'c'::tsquery); + ts_rewrite ----------- 'b' & 'c' @@ -1184,21 +1183,17 @@ SELECT rewrite('a & b'::tsquery, 'a'::tsquery, 'c'::tsquery); - -rewrite - 2 - - -rewrite(ARRAY[query TSQUERY, target TSQUERY, sample TSQUERY]) returns TSQUERY +ts_rewrite(ARRAY[query TSQUERY, target TSQUERY, sample TSQUERY]) returns TSQUERY -SELECT rewrite(ARRAY['a & b'::tsquery, t,s]) FROM aliases; - rewrite +SELECT ts_rewrite(ARRAY['a & b'::tsquery, t,s]) FROM aliases; + ts_rewrite ----------- 'b' & 'c' @@ -1208,21 +1203,17 @@ SELECT rewrite(ARRAY['a & b'::tsquery, t,s]) FROM aliases; - -rewrite - 3 - - -rewrite (query TSQUERY,'SELECT target ,sample FROM test'::text) returns TSQUERY +ts_rewrite (query TSQUERY,'SELECT target ,sample FROM test'::text) returns TSQUERY -SELECT rewrite('a & b'::tsquery, 'SELECT t,s FROM aliases'); - rewrite +SELECT ts_rewrite('a & b'::tsquery, 'SELECT t,s FROM aliases'); + ts_rewrite ----------- 'b' & 'c' @@ -1246,12 +1237,12 @@ SELECT * FROM aliases; This ambiguity can be resolved by specifying a sort order: -SELECT rewrite('a & b', 'SELECT t, s FROM aliases ORDER BY t DESC'); - rewrite +SELECT ts_rewrite('a & b', 'SELECT t, s FROM aliases ORDER BY t DESC'); + ts_rewrite --------- 'cc' -SELECT rewrite('a & b', 'SELECT t, s FROM aliases ORDER BY t ASC'); - rewrite +SELECT ts_rewrite('a & b', 'SELECT t, s FROM aliases ORDER BY t ASC'); + ts_rewrite ----------- 'b' & 'c' @@ -1263,7 +1254,7 @@ Let's consider a real-life astronomical example. 
We'll expand query CREATE TABLE aliases (t tsquery primary key, s tsquery); INSERT INTO aliases VALUES(to_tsquery('supernovae'), to_tsquery('supernovae|sn')); -SELECT rewrite(to_tsquery('supernovae'), 'SELECT * FROM aliases') && to_tsquery('crab'); +SELECT ts_rewrite(to_tsquery('supernovae'), 'SELECT * FROM aliases') && to_tsquery('crab'); ?column? --------------------------------- ( 'supernova' | 'sn' ) & 'crab' @@ -1271,7 +1262,7 @@ SELECT rewrite(to_tsquery('supernovae'), 'SELECT * FROM aliases') && to Notice, that we can change the rewriting rule online: UPDATE aliases SET s=to_tsquery('supernovae|sn & !nebulae') WHERE t=to_tsquery('supernovae'); -SELECT rewrite(to_tsquery('supernovae'), 'SELECT * FROM aliases') && to_tsquery('crab'); +SELECT ts_rewrite(to_tsquery('supernovae'), 'SELECT * FROM aliases') && to_tsquery('crab'); ?column? --------------------------------------------- ( 'supernova' | 'sn' & !'nebula' ) & 'crab' @@ -1288,10 +1279,10 @@ for a possible hit. To filter out obvious non-candidate rules there are containm operators for the tsquery type. In the example below, we select only those rules which might contain the original query: -SELECT rewrite(ARRAY['a & b'::tsquery, t,s]) +SELECT ts_rewrite(ARRAY['a & b'::tsquery, t,s]) FROM aliases WHERE 'a & b' @> t; - rewrite + ts_rewrite ----------- 'b' & 'c' @@ -1525,7 +1516,7 @@ SELECT * FROM ts_parse('default','123 - a number'); -token_type +ts_token_type @@ -1894,11 +1885,13 @@ configuration config_name is realized by superimposed coding (Knuth, 1973) of signatures, i.e., a parent is the result of 'OR'-ing the bit-strings of all children. This is a second factor of lossiness. It is clear that parents tend to be full of -'1's (degenerates) and become quite useless because of the +1s (degenerates) and become quite useless because of the limited selectivity. Searching is performed as a bit comparison of a signature representing the query and an RD-tree entry. 
-If all '1's of both signatures are in the same position we +If all 1s of both signatures are in the same position we say that this branch probably matches the query, but if there is even one discrepancy we can definitely reject this branch. @@ -2870,13 +2863,15 @@ The current limitations of Full Text Searching are: For comparison, the PostgreSQL 8.1 documentation -consists of 10,441 unique words, a total of 335,420 words, and the most frequent word -'postgresql' is mentioned 6,127 times in 655 documents. +contained 10,441 unique words, a total of 335,420 words, and the most frequent +word postgresql was mentioned 6,127 times in 655 documents. + -Another example - the PostgreSQL mailing list archives -consists of 910,989 unique words with 57,491,343 lexemes in 461,020 messages. +Another example — the PostgreSQL mailing list +archives contained 910,989 unique words with 57,491,343 lexemes in 461,020 +messages. @@ -2942,28 +2937,27 @@ names and object names. The following examples illustrate this: => \dF+ russian Configuration "pg_catalog.russian" Parser name: "pg_catalog.default" -Locale: 'ru_RU.UTF-8' (default) Token | Dictionaries --------------+------------------------- email | pg_catalog.simple file | pg_catalog.simple float | pg_catalog.simple host | pg_catalog.simple - hword | pg_catalog.ru_stem_utf8 + hword | pg_catalog.russian_stem int | pg_catalog.simple lhword | public.tz_simple lpart_hword | public.tz_simple lword | public.tz_simple - nlhword | pg_catalog.ru_stem_utf8 - nlpart_hword | pg_catalog.ru_stem_utf8 - nlword | pg_catalog.ru_stem_utf8 + nlhword | pg_catalog.russian_stem + nlpart_hword | pg_catalog.russian_stem + nlword | pg_catalog.russian_stem part_hword | pg_catalog.simple sfloat | pg_catalog.simple uint | pg_catalog.simple uri | pg_catalog.simple url | pg_catalog.simple version | pg_catalog.simple - word | pg_catalog.ru_stem_utf8 + word | pg_catalog.russian_stem @@ -3112,43 +3106,43 @@ play with the standard english configuration. 
CREATE TEXT SEARCH CONFIGURATION public.english ( COPY = pg_catalog.english ); -CREATE TEXT SEARCH DICTIONARY en_ispell ( +CREATE TEXT SEARCH DICTIONARY english_ispell ( TEMPLATE = ispell, - DictFile = english-utf8, - AffFile = english-utf8, + DictFile = english, + AffFile = english, StopWords = english ); ALTER TEXT SEARCH CONFIGURATION public.english - ALTER MAPPING FOR lword WITH en_ispell, en_stem; + ALTER MAPPING FOR lword WITH english_ispell, english_stem; SELECT * FROM ts_debug('public.english','The Brightest supernovaes'); Alias | Description | Token | Dicts list | Lexized token -------+---------------+-------------+---------------------------------------+--------------------------------- - lword | Latin word | The | {public.en_ispell,pg_catalog.en_stem} | public.en_ispell: {} + lword | Latin word | The | {public.english_ispell,pg_catalog.english_stem} | public.english_ispell: {} blank | Space symbols | | | - lword | Latin word | Brightest | {public.en_ispell,pg_catalog.en_stem} | public.en_ispell: {bright} + lword | Latin word | Brightest | {public.english_ispell,pg_catalog.english_stem} | public.english_ispell: {bright} blank | Space symbols | | | - lword | Latin word | supernovaes | {public.en_ispell,pg_catalog.en_stem} | pg_catalog.en_stem: {supernova} + lword | Latin word | supernovaes | {public.english_ispell,pg_catalog.english_stem} | pg_catalog.english_stem: {supernova} (5 rows) -In this example, the word 'Brightest' was recognized by a +In this example, the word Brightest was recognized by a parser as a Latin word (alias lword) -and came through the dictionaries public.en_ispell and -pg_catalog.en_stem. It was recognized by -public.en_ispell, which reduced it to the noun +and came through the dictionaries public.english_ispell and +pg_catalog.english_stem. It was recognized by +public.english_ispell, which reduced it to the noun bright. 
The word supernovaes is unknown -by the public.en_ispell dictionary so it was passed to +by the public.english_ispell dictionary so it was passed to the next dictionary, and, fortunately, was recognized (in fact, -public.en_stem is a stemming dictionary and recognizes +pg_catalog.english_stem is a stemming dictionary and recognizes everything; that is why it was placed at the end of the dictionary stack). -The word The was recognized by public.en_ispell +The word The was recognized by public.english_ispell dictionary as a stop word () and will not be indexed. @@ -3159,11 +3153,11 @@ SELECT "Alias", "Token", "Lexized token" FROM ts_debug('public.english','The Brightest supernovaes'); Alias | Token | Lexized token -------+-------------+--------------------------------- - lword | The | public.en_ispell: {} + lword | The | public.english_ispell: {} blank | | - lword | Brightest | public.en_ispell: {bright} + lword | Brightest | public.english_ispell: {bright} blank | | - lword | supernovaes | pg_catalog.en_stem: {supernova} + lword | supernovaes | pg_catalog.english_stem: {supernova} (5 rows)