src/backend/parser/scansup.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * scansup.c
   4  *        support routines for the lex/flex scanner, used by both the normal
   5  * backend as well as the bootstrap backend
   6  *
   7  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  *
  11  * IDENTIFICATION
  12  *        $PostgreSQL: pgsql/src/backend/parser/scansup.c,v 1.37 2009/01/01 17:23:46 momjian Exp $
  13  *
  14  *-------------------------------------------------------------------------
  15  */
  16 #include "postgres.h"
  17
  18 #include <ctype.h>
  19
  20 #include "parser/scansup.h"
  21 #include "mb/pg_wchar.h"
  22
  23
  24 /* ----------------
  25  *              scanstr
  26  *
  27  * if the string passed in has escaped codes, map the escape codes to actual
  28  * chars
  29  *
  30  * the string returned is palloc'd and should eventually be pfree'd by the
  31  * caller!
  32  * ----------------
  33  */
  34
  35 char *
  36 scanstr(const char *s)
  37 {
  38         char       *newStr;
  39         int                     len,
  40                                 i,
  41                                 j;
  42
  43         if (s == NULL || s[0] == '\0')
  44                 return pstrdup("");
  45
  46         len = strlen(s);
  47
  48         newStr = palloc(len + 1);       /* string cannot get longer */
  49
  50         for (i = 0, j = 0; i < len; i++)
  51         {
  52                 if (s[i] == '\'')
  53                 {
  54                         /*
  55                          * Note: if scanner is working right, unescaped quotes can only
  56                          * appear in pairs, so there should be another character.
  57                          */
  58                         i++;
  59                         newStr[j] = s[i];
  60                 }
  61                 else if (s[i] == '\\')
  62                 {
  63                         i++;
  64                         switch (s[i])
  65                         {
  66                                 case 'b':
  67                                         newStr[j] = '\b';
  68                                         break;
  69                                 case 'f':
  70                                         newStr[j] = '\f';
  71                                         break;
  72                                 case 'n':
  73                                         newStr[j] = '\n';
  74                                         break;
  75                                 case 'r':
  76                                         newStr[j] = '\r';
  77                                         break;
  78                                 case 't':
  79                                         newStr[j] = '\t';
  80                                         break;
  81                                 case '0':
  82                                 case '1':
  83                                 case '2':
  84                                 case '3':
  85                                 case '4':
  86                                 case '5':
  87                                 case '6':
  88                                 case '7':
  89                                         {
  90                                                 int                     k;
  91                                                 long            octVal = 0;
  92
  93                                                 for (k = 0;
  94                                                          s[i + k] >= '0' && s[i + k] <= '7' && k < 3;
  95                                                          k++)
  96                                                         octVal = (octVal << 3) + (s[i + k] - '0');
  97                                                 i += k - 1;
  98                                                 newStr[j] = ((char) octVal);
  99                                         }
 100                                         break;
 101                                 default:
 102                                         newStr[j] = s[i];
 103                                         break;
 104                         }                                       /* switch */
 105                 }                                               /* s[i] == '\\' */
 106                 else
 107                         newStr[j] = s[i];
 108                 j++;
 109         }
 110         newStr[j] = '\0';
 111         return newStr;
 112 }
 113
 114
 115 /*
 116  * downcase_truncate_identifier() --- do appropriate downcasing and
 117  * truncation of an unquoted identifier.  Optionally warn of truncation.
 118  *
 119  * Returns a palloc'd string containing the adjusted identifier.
 120  *
 121  * Note: in some usages the passed string is not null-terminated.
 122  *
 123  * Note: the API of this function is designed to allow for downcasing
 124  * transformations that increase the string length, but we don't yet
 125  * support that.  If you want to implement it, you'll need to fix
 126  * SplitIdentifierString() in utils/adt/varlena.c.
 127  */
 128 char *
 129 downcase_truncate_identifier(const char *ident, int len, bool warn)
 130 {
 131         char       *result;
 132         int                     i;
 133
 134         result = palloc(len + 1);
 135
 136         /*
 137          * SQL99 specifies Unicode-aware case normalization, which we don't yet
 138          * have the infrastructure for.  Instead we use tolower() to provide a
 139          * locale-aware translation.  However, there are some locales where this
 140          * is not right either (eg, Turkish may do strange things with 'i' and
 141          * 'I').  Our current compromise is to use tolower() for characters with
 142          * the high bit set, and use an ASCII-only downcasing for 7-bit
 143          * characters.
 144          */
 145         for (i = 0; i < len; i++)
 146         {
 147                 unsigned char ch = (unsigned char) ident[i];
 148
 149                 if (ch >= 'A' && ch <= 'Z')
 150                         ch += 'a' - 'A';
 151                 else if (IS_HIGHBIT_SET(ch) && isupper(ch))
 152                         ch = tolower(ch);
 153                 result[i] = (char) ch;
 154         }
 155         result[i] = '\0';
 156
 157         if (i >= NAMEDATALEN)
 158                 truncate_identifier(result, i, warn);
 159
 160         return result;
 161 }
 162
 163 /*
 164  * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
 165  *
 166  * The given string is modified in-place, if necessary.  A warning is
 167  * issued if requested.
 168  *
 169  * We require the caller to pass in the string length since this saves a
 170  * strlen() call in some common usages.
 171  */
 172 void
 173 truncate_identifier(char *ident, int len, bool warn)
 174 {
 175         if (len >= NAMEDATALEN)
 176         {
 177                 len = pg_mbcliplen(ident, len, NAMEDATALEN - 1);
 178                 if (warn)
 179                         ereport(NOTICE,
 180                                         (errcode(ERRCODE_NAME_TOO_LONG),
 181                                          errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
 182                                                         ident, len, ident)));
 183                 ident[len] = '\0';
 184         }
 185 }
 186
 187 /*
 188  * scanner_isspace() --- return TRUE if flex scanner considers char whitespace
 189  *
 190  * This should be used instead of the potentially locale-dependent isspace()
 191  * function when it's important to match the lexer's behavior.
 192  *
 193  * In principle we might need similar functions for isalnum etc, but for the
 194  * moment only isspace seems needed.
 195  */
 196 bool
 197 scanner_isspace(char ch)
 198 {
 199         /* This must match scan.l's list of {space} characters */
 200         /* and plpgsql's scan.l as well */
 201         if (ch == ' ' ||
 202                 ch == '\t' ||
 203                 ch == '\n' ||
 204                 ch == '\r' ||
 205                 ch == '\f')
 206                 return true;
 207         return false;
 208 }