From 3e17ef1cfaa23f05c34cacb73c756bf03bf63f3a Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 22 Oct 2007 20:13:37 +0000 Subject: [PATCH] Adjust ts_debug's output as per my proposal of yesterday: show the active dictionary and its output lexemes as separate columns, instead of smashing them into one text column, and lowercase the column names. Also, define the output rowtype using OUT parameters instead of a composite type, to be consistent with the other built-in functions. --- doc/src/sgml/func.sgml | 8 +- doc/src/sgml/textsearch.sgml | 200 ++++++++++++++++++++--------------- src/backend/catalog/system_views.sql | 64 +++++------ src/include/catalog/catversion.h | 4 +- 4 files changed, 153 insertions(+), 123 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index afdda69720..368673c66e 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -1,4 +1,4 @@ - + Functions and Operators @@ -7857,11 +7857,11 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple - ts_debug( config regconfig, document text) - setof ts_debug + ts_debug( config regconfig, document text, OUT alias text, OUT description text, OUT token text, OUT dictionaries regdictionary[], OUT dictionary regdictionary, OUT lexemes text[]) + setof record test a configuration ts_debug('english', 'The Brightest supernovaes') - (lword,"Latin word",The,{english_stem},"english_stem: {}") ... + (lword,"Latin word",The,{english_stem},english_stem,{}) ... ts_lexize(dict regdictionary, token text) diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 03625b41a5..81b54d8e17 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1,4 +1,4 @@ - + Full Text Search @@ -1699,18 +1699,18 @@ ON messages FOR EACH ROW EXECUTE PROCEDURE messages_trigger(); - word text — the value of a lexeme + word text — the value of a lexeme - ndoc integer — number of documents + ndoc integer — number of documents (tsvectors) the word occurred in - nentry integer — total number of + nentry integer — total number of occurrences of the word @@ -1901,8 +1901,8 @@ LIMIT 10; as the entire word and as each component: -SELECT "Alias", "Description", "Token" FROM ts_debug('foo-bar-beta1'); - Alias | Description | Token +SELECT alias, description, token FROM ts_debug('foo-bar-beta1'); + alias | description | token -------------+-------------------------------+--------------- hword | Hyphenated word | foo-bar-beta1 lpart_hword | Latin part of hyphenated word | foo @@ -1917,8 +1917,8 @@ SELECT "Alias", "Description", "Token" FROM ts_debug('foo-bar-beta1'); instructive example: -SELECT "Alias", "Description", "Token" FROM ts_debug('http://foo.com/stuff/index.html'); - Alias | Description | Token +SELECT alias, description, token FROM ts_debug('http://foo.com/stuff/index.html'); + alias | description | token ----------+---------------+-------------------------- protocol | Protocol head | http:// url | URL | foo.com/stuff/index.html @@ -2186,25 +2186,23 @@ SELECT ts_lexize('public.simple_dict','The'); synonym dictionary and put it before the english_stem dictionary: -SELECT * FROM ts_debug('english','Paris'); - Alias | Description | Token | Dictionaries | Lexized token --------+-------------+-------+----------------+---------------------- - lword | Latin word | Paris | {english_stem} | english_stem: {pari} -(1 row) +SELECT * FROM ts_debug('english', 'Paris'); + alias | description | token | dictionaries | dictionary | lexemes +-------+-------------+-------+----------------+--------------+--------- + lword | Latin word | Paris | {english_stem} | english_stem | {pari} -CREATE TEXT SEARCH DICTIONARY synonym ( +CREATE TEXT SEARCH DICTIONARY my_synonym ( TEMPLATE = synonym, SYNONYMS = my_synonyms ); ALTER TEXT SEARCH CONFIGURATION english - ALTER MAPPING FOR lword WITH synonym, english_stem; + ALTER MAPPING FOR lword WITH my_synonym, english_stem; -SELECT * FROM ts_debug('english','Paris'); - Alias | Description | Token | Dictionaries | Lexized token --------+-------------+-------+------------------------+------------------ - lword | Latin word | Paris | {synonym,english_stem} | synonym: {paris} -(1 row) +SELECT * FROM ts_debug('english', 'Paris'); + alias | description | token | dictionaries | dictionary | lexemes +-------+-------------+-------+---------------------------+------------+--------- + lword | Latin word | Paris | {my_synonym,english_stem} | my_synonym | {paris} @@ -2711,7 +2709,14 @@ SHOW default_text_search_config; - ts_debug( config regconfig, document text) returns setof ts_debug + ts_debug( config regconfig, document text, + OUT alias text, + OUT description text, + OUT token text, + OUT dictionaries regdictionary[], + OUT dictionary regdictionary, + OUT lexemes text[]) + returns setof record @@ -2725,23 +2730,47 @@ SHOW default_text_search_config; - ts_debug's result row type is defined as: + ts_debug returns one row for each token identified in the text + by the parser. The columns returned are - -CREATE TYPE ts_debug AS ( - "Alias" text, - "Description" text, - "Token" text, - "Dictionaries" regdictionary[], - "Lexized token" text -); - - - One row is produced for each token identified by the parser. - The first three columns describe the token, and the fourth lists - the dictionaries selected by the configuration for that token's type. - The last column shows the result of dictionary processing: which - dictionary (if any) recognized the token, and what it produced. + + + + alias text — short name of the token type + + + + + description text — description of the + token type + + + + + token text — text of the token + + + + + dictionaries regdictionary[] — the + dictionaries selected by the configuration for this token type + + + + + dictionary regdictionary — the dictionary + that recognized the token, or NULL if none did + + + + + lexemes text[] — the lexeme(s) produced + by the dictionary that recognized the token, or NULL if + none did; an empty array ({}) means it was recognized as a + stop word + + + @@ -2749,33 +2778,32 @@ CREATE TYPE ts_debug AS ( SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats'); - Alias | Description | Token | Dictionaries | Lexized token --------+---------------+-------+--------------+---------------- - lword | Latin word | a | {english} | english: {} - blank | Space symbols | | | - lword | Latin word | fat | {english} | english: {fat} - blank | Space symbols | | | - lword | Latin word | cat | {english} | english: {cat} - blank | Space symbols | | | - lword | Latin word | sat | {english} | english: {sat} - blank | Space symbols | | | - lword | Latin word | on | {english} | english: {} - blank | Space symbols | | | - lword | Latin word | a | {english} | english: {} - blank | Space symbols | | | - lword | Latin word | mat | {english} | english: {mat} - blank | Space symbols | | | - blank | Space symbols | - | | - lword | Latin word | it | {english} | english: {} - blank | Space symbols | | | - lword | Latin word | ate | {english} | english: {ate} - blank | Space symbols | | | - lword | Latin word | a | {english} | english: {} - blank | Space symbols | | | - lword | Latin word | fat | {english} | english: {fat} - blank | Space symbols | | | - lword | Latin word | rats | {english} | english: {rat} - (24 rows) + alias | description | token | dictionaries | dictionary | lexemes +-------+---------------+-------+----------------+--------------+--------- + lword | Latin word | a | {english_stem} | english_stem | {} + blank | Space symbols | | {} | | + lword | Latin word | fat | {english_stem} | english_stem | {fat} + blank | Space symbols | | {} | | + lword | Latin word | cat | {english_stem} | english_stem | {cat} + blank | Space symbols | | {} | | + lword | Latin word | sat | {english_stem} | english_stem | {sat} + blank | Space symbols | | {} | | + lword | Latin word | on | {english_stem} | english_stem | {} + blank | Space symbols | | {} | | + lword | Latin word | a | {english_stem} | english_stem | {} + blank | Space symbols | | {} | | + lword | Latin word | mat | {english_stem} | english_stem | {mat} + blank | Space symbols | | {} | | + blank | Space symbols | - | {} | | + lword | Latin word | it | {english_stem} | english_stem | {} + blank | Space symbols | | {} | | + lword | Latin word | ate | {english_stem} | english_stem | {ate} + blank | Space symbols | | {} | | + lword | Latin word | a | {english_stem} | english_stem | {} + blank | Space symbols | | {} | | + lword | Latin word | fat | {english_stem} | english_stem | {fat} + blank | Space symbols | | {} | | + lword | Latin word | rats | {english_stem} | english_stem | {rat} @@ -2801,34 +2829,33 @@ ALTER TEXT SEARCH CONFIGURATION public.english SELECT * FROM ts_debug('public.english','The Brightest supernovaes'); - Alias | Description | Token | Dictionaries | Lexized token --------+---------------+-------------+-------------------------------------------------+------------------------------------- - lword | Latin word | The | {public.english_ispell,pg_catalog.english_stem} | public.english_ispell: {} - blank | Space symbols | | | - lword | Latin word | Brightest | {public.english_ispell,pg_catalog.english_stem} | public.english_ispell: {bright} - blank | Space symbols | | | - lword | Latin word | supernovaes | {public.english_ispell,pg_catalog.english_stem} | pg_catalog.english_stem: {supernova} -(5 rows) + alias | description | token | dictionaries | dictionary | lexemes +-------+---------------+-------------+-------------------------------+----------------+------------- + lword | Latin word | The | {english_ispell,english_stem} | english_ispell | {} + blank | Space symbols | | {} | | + lword | Latin word | Brightest | {english_ispell,english_stem} | english_ispell | {bright} + blank | Space symbols | | {} | | + lword | Latin word | supernovaes | {english_ispell,english_stem} | english_stem | {supernova} In this example, the word Brightest was recognized by the parser as a Latin word (alias lword). For this token type the dictionary list is - public.english_ispell and - pg_catalog.english_stem. The word was recognized by - public.english_ispell, which reduced it to the noun + english_ispell and + english_stem. The word was recognized by + english_ispell, which reduced it to the noun bright. The word supernovaes is - unknown to the public.english_ispell dictionary so it + unknown to the english_ispell dictionary so it was passed to the next dictionary, and, fortunately, was recognized (in - fact, public.english_stem is a Snowball dictionary which + fact, english_stem is a Snowball dictionary which recognizes everything; that is why it was placed at the end of the dictionary list). The word The was recognized by the - public.english_ispell dictionary as a stop word (english_ispell dictionary as a stop word () and will not be indexed. The spaces are discarded too, since the configuration provides no dictionaries at all for them. @@ -2839,16 +2866,15 @@ SELECT * FROM ts_debug('public.english','The Brightest supernovaes'); you want to see: -SELECT "Alias", "Token", "Lexized token" +SELECT alias, token, dictionary, lexemes FROM ts_debug('public.english','The Brightest supernovaes'); - Alias | Token | Lexized token --------+-------------+-------------------------------------- - lword | The | public.english_ispell: {} - blank | | - lword | Brightest | public.english_ispell: {bright} - blank | | - lword | supernovaes | pg_catalog.english_stem: {supernova} -(5 rows) + alias | token | dictionary | lexemes +-------+-------------+----------------+------------- + lword | The | english_ispell | {} + blank | | | + lword | Brightest | english_ispell | {bright} + blank | | | + lword | supernovaes | english_stem | {supernova} diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 5e557efef4..1f1d983573 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -3,7 +3,7 @@ * * Copyright (c) 1996-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/backend/catalog/system_views.sql,v 1.46 2007/09/25 20:03:37 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/system_views.sql,v 1.47 2007/10/22 20:13:37 tgl Exp $ */ CREATE VIEW pg_roles AS @@ -386,41 +386,39 @@ CREATE VIEW pg_stat_bgwriter AS pg_stat_get_buf_written_backend() AS buffers_backend, pg_stat_get_buf_alloc() AS buffers_alloc; --- Tsearch debug function. Defined here because it'd be pretty unwieldy +-- Tsearch debug function. Defined here because it'd be pretty unwieldy -- to put it into pg_proc.h -CREATE TYPE ts_debug AS ( - "Alias" text, - "Description" text, - "Token" text, - "Dictionaries" regdictionary[], - "Lexized token" text -); - -COMMENT ON TYPE ts_debug IS 'type returned from ts_debug() function'; - -CREATE FUNCTION ts_debug(regconfig, text) -RETURNS SETOF ts_debug AS +CREATE FUNCTION ts_debug(IN config regconfig, IN document text, + OUT alias text, + OUT description text, + OUT token text, + OUT dictionaries regdictionary[], + OUT dictionary regdictionary, + OUT lexemes text[]) +RETURNS SETOF record AS $$ SELECT - tt.alias AS "Alias", - tt.description AS "Description", - parse.token AS "Token", + tt.alias AS alias, + tt.description AS description, + parse.token AS token, ARRAY ( SELECT m.mapdict::pg_catalog.regdictionary FROM pg_catalog.pg_ts_config_map AS m WHERE m.mapcfg = $1 AND m.maptokentype = parse.tokid ORDER BY m.mapseqno ) - AS "Dictionaries", - ( - SELECT - dl.mapdict::pg_catalog.regdictionary || ': ' || dl.lex::pg_catalog.text - FROM - ( SELECT mapdict, pg_catalog.ts_lexize(mapdict, parse.token) AS lex - FROM pg_catalog.pg_ts_config_map AS m - WHERE m.mapcfg = $1 AND m.maptokentype = parse.tokid - ORDER BY pg_catalog.ts_lexize(mapdict, parse.token) IS NULL, m.mapseqno ) dl - LIMIT 1 - ) AS "Lexized token" + AS dictionaries, + ( SELECT mapdict::pg_catalog.regdictionary + FROM pg_catalog.pg_ts_config_map AS m + WHERE m.mapcfg = $1 AND m.maptokentype = parse.tokid + ORDER BY pg_catalog.ts_lexize(mapdict, parse.token) IS NULL, m.mapseqno + LIMIT 1 + ) AS dictionary, + ( SELECT pg_catalog.ts_lexize(mapdict, parse.token) + FROM pg_catalog.pg_ts_config_map AS m + WHERE m.mapcfg = $1 AND m.maptokentype = parse.tokid + ORDER BY pg_catalog.ts_lexize(mapdict, parse.token) IS NULL, m.mapseqno + LIMIT 1 + ) AS lexemes FROM pg_catalog.ts_parse( (SELECT cfgparser FROM pg_catalog.pg_ts_config WHERE oid = $1 ), $2 ) AS parse, @@ -434,8 +432,14 @@ LANGUAGE SQL STRICT STABLE; COMMENT ON FUNCTION ts_debug(regconfig,text) IS 'debug function for text search configuration'; -CREATE FUNCTION ts_debug(text) -RETURNS SETOF ts_debug AS +CREATE FUNCTION ts_debug(IN document text, + OUT alias text, + OUT description text, + OUT token text, + OUT dictionaries regdictionary[], + OUT dictionary regdictionary, + OUT lexemes text[]) +RETURNS SETOF record AS $$ SELECT * FROM pg_catalog.ts_debug( pg_catalog.get_current_ts_config(), $1); $$ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 467277d8ad..1fa5428a96 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -37,7 +37,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.434 2007/10/19 22:01:45 tgl Exp $ + * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.435 2007/10/22 20:13:37 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 200710192 +#define CATALOG_VERSION_NO 200710221 #endif -- 2.11.0