-
+
Functions and Operators
|
- text
+ bytea
Change encoding using specified conversion name. Conversions
can be defined by CREATE CONVERSION. Also
there are some pre-defined conversion names. See
linkend="conversion-names"> for available conversion
- names.
+ names. The
string must be valid in the
+ source encoding.
convert('PostgreSQL' using iso_8859_1_to_utf8)
'PostgreSQL' in UTF8 (Unicode, 8-bit) encoding
+
+
+
+
|
int
-
ASCII code of the first byte of the argument
+
+
ASCII code of the first character of the argument.
+ For
UTF8 returns the Unicode code point of the character.
+ For other multi-byte encodings, the argument must be a strictly
+
ascii('x')
120
|
chr(int)
text
-
Character with the given ASCII code
+
+ Character with the given code. For
UTF8 the argument is
+ treated as a Unicode code point. For other multi-byte encodings the argument
+ must designate a strictly
ASCII character.
+
chr(65)
A
|
- <
optional>src_encoding name,
+ <parameter>src_encoding name,
- text
+ bytea
Convert string to
dest_encoding.
The original encoding is specified by
-
src_encoding is omitted, database
- encoding is assumed.
+
src_encoding. The
string
+ must be valid in this encoding.
convert( 'text_in_utf8', 'UTF8', 'LATIN1')
text_in_utf8 represented in ISO 8859-1 encoding
+ |
+
+
convert_from(string bytea,
+
+ text
+
+ Convert string to the database encoding.
+ The original encoding is specified by
+
src_encoding. The
string
+ must be valid in this encoding.
+
+ convert_from( 'text_in_utf8', 'UTF8')
+ text_in_utf8 represented in the current database encoding
+
+
+ |
+
+
convert_to(string text,
+
+ text
+
+ Convert string to
dest_encoding.
+
+ convert_to( 'some text', 'UTF8')
+ some text represented in the UTF8 encoding
+
+
|
4
+ |
+ int
+
+ Number of characters in
string in the
+
string must be valid in this encoding.
+
+ length('jose', 'UTF8')
+ 4
+
+
|
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/catalog/pg_conversion.c,v 1.36 2007/02/27 23:48:07 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/catalog/pg_conversion.c,v 1.37 2007/09/18 17:41:17 adunstan Exp $
*
*-------------------------------------------------------------------------
*/
* CONVERT
* USING
*
- * TEXT convert_using(TEXT string, TEXT conversion_name)
+ * BYTEA convert_using(TEXT string, TEXT conversion_name)
+ *
+ * bytea is returned so we don't give a value that is
+ * not valid in the database encoding.
*/
Datum
pg_convert_using(PG_FUNCTION_ARGS)
pfree(result);
pfree(str);
- PG_RETURN_TEXT_P(retval);
+ PG_RETURN_BYTEA_P(retval);
}
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.70 2007/02/27 23:48:08 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.71 2007/09/18 17:41:17 adunstan Exp $
*
*-------------------------------------------------------------------------
*/
*
* Returns the decimal representation of the first character from
* string.
+ * If the string is empty we return 0.
+ * If the database encoding is UTF8, we return the Unicode codepoint.
+ * If the database encoding is any other multi-byte encoding, we
+ * return the value of the first byte if it is an ASCII character
+ * (range 1 .. 127), or raise an error.
+ * For all other encodings we return the value of the first byte,
+ * (range 1..255).
*
********************************************************************/
ascii(PG_FUNCTION_ARGS)
{
text *string = PG_GETARG_TEXT_P(0);
+ int encoding = GetDatabaseEncoding();
+ unsigned char *data;
if (VARSIZE(string) <= VARHDRSZ)
PG_RETURN_INT32(0); /* empty string: there is no first character, report 0 */
- PG_RETURN_INT32((int32) *((unsigned char *) VARDATA(string)));
+ data = (unsigned char *) VARDATA(string); /* unsigned so bytes above 127 compare as 128..255 */
+
+ if (encoding == PG_UTF8 && *data > 127)
+ {
+ /* multi-byte UTF8 sequence: decode it and return the Unicode code point */
+
+ int result = 0, tbytes = 0, i;
+
+ if (*data >= 0xF0)
+ {
+ result = *data & 0x07; /* lead byte contributes its low 3 bits */
+ tbytes = 3; /* three trailing bytes follow */
+ }
+ else if (*data >= 0xE0)
+ {
+ result = *data & 0x0F; /* lead byte contributes its low 4 bits */
+ tbytes = 2; /* two trailing bytes follow */
+ }
+ else
+ {
+ Assert (*data > 0xC0);
+ result = *data & 0x1f; /* lead byte contributes its low 5 bits */
+ tbytes = 1; /* one trailing byte follows */
+ }
+
+ Assert (tbytes > 0);
+
+ for (i = 1; i <= tbytes; i++)
+ {
+ Assert ((data[i] & 0xC0) == 0x80);
+ result = (result << 6) + (data[i] & 0x3f); /* append the low 6 bits of each trailing byte */
+ }
+
+ PG_RETURN_INT32(result);
+ }
+ else
+ {
+ if (pg_encoding_max_length(encoding) > 1 && *data > 127) /* high-bit first byte in a multi-byte encoding other than UTF8 */
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("requested character too large")));
+
+
+ PG_RETURN_INT32((int32) *data); /* otherwise the first byte's value is the answer */
+ }
}
/********************************************************************
*
* Purpose:
*
- * Returns the character having the binary equivalent to val
+ * Returns the character having the binary equivalent to val.
+ *
+ * For UTF8 we treat the argument as a Unicode code point.
+ * For other multi-byte encodings we raise an error for arguments
+ * outside the strict ASCII range (1..127).
+ *
+ * It's important that we don't ever return a value that is not valid
+ * in the database encoding, so that this doesn't become a way for
+ * invalid data to enter the database.
*
********************************************************************/
Datum
chr(PG_FUNCTION_ARGS)
{
- int32 cvalue = PG_GETARG_INT32(0);
+ uint32 cvalue = PG_GETARG_UINT32(0);
text *result;
+ int encoding = GetDatabaseEncoding();
+
+ if (encoding == PG_UTF8 && cvalue > 127)
+ {
+ /* for Unicode we treat the argument as a code point */
+ int bytes;
+ char *wch;
- result = (text *) palloc(VARHDRSZ + 1);
- SET_VARSIZE(result, VARHDRSZ + 1);
- *VARDATA(result) = (char) cvalue;
+ /*
+ * We only allow valid Unicode code points; per RFC 3629 that stops
+ * at U+10FFFF, even though a 4-byte UTF8 sequence has room for
+ * values up to U+1FFFFF.
+ *
+ * NOTE(review): surrogate code points (U+D800..U+DFFF) are still
+ * accepted here -- confirm whether they should be rejected as well.
+ */
+ if (cvalue > 0x0010ffff)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("requested character too large for encoding: %u",
+ cvalue)));
+
+ /* cvalue is above 127 here, so the encoded form is 2, 3 or 4 bytes */
+ if (cvalue > 0xffff)
+ bytes = 4;
+ else if (cvalue > 0x07ff)
+ bytes = 3;
+ else
+ bytes = 2;
+
+ result = (text *) palloc(VARHDRSZ + bytes);
+ SET_VARSIZE(result, VARHDRSZ + bytes);
+ wch = VARDATA(result);
+
+ /*
+ * The lead byte carries the length marker plus the top bits of the
+ * code point; every trailing byte is 10xxxxxx with 6 payload bits.
+ */
+ if (bytes == 2)
+ {
+ wch[0] = 0xC0 | ((cvalue >> 6) & 0x1F);
+ wch[1] = 0x80 | (cvalue & 0x3F);
+ }
+ else if (bytes == 3)
+ {
+ wch[0] = 0xE0 | ((cvalue >> 12) & 0x0F);
+ wch[1] = 0x80 | ((cvalue >> 6) & 0x3F);
+ wch[2] = 0x80 | (cvalue & 0x3F);
+ }
+ else
+ {
+ wch[0] = 0xF0 | ((cvalue >> 18) & 0x07);
+ wch[1] = 0x80 | ((cvalue >> 12) & 0x3F);
+ wch[2] = 0x80 | ((cvalue >> 6) & 0x3F);
+ wch[3] = 0x80 | (cvalue & 0x3F);
+ }
+
+ }
+
+ else
+ {
+ bool is_mb;
+
+ /* Error out on arguments that make no sense or that we
+ * can't validly represent in the encoding.
+ */
+
+ if (cvalue == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("null character not permitted")));
+
+ is_mb = pg_encoding_max_length(encoding) > 1;
+
+ /*
+ * A single byte above 127 is not a complete character in a
+ * multi-byte encoding, so only strict ASCII (1..127) is allowed
+ * there; single-byte encodings may use the whole 1..255 range.
+ */
+ if ((is_mb && (cvalue > 127)) || (! is_mb && (cvalue > 255)))
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("requested character too large for encoding: %u",
+ cvalue)));
+
+
+ result = (text *) palloc(VARHDRSZ + 1);
+ SET_VARSIZE(result, VARHDRSZ + 1);
+ *VARDATA(result) = (char) cvalue;
+ }
PG_RETURN_TEXT_P(result);
}
* (currently mule internal code (mic) is used)
* Tatsuo Ishii
*
- * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.63 2007/05/28 16:43:24 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.64 2007/09/18 17:41:17 adunstan Exp $
*/
#include "postgres.h"
}
/*
- * Convert string using encoding_nanme. We assume that string's
- * encoding is same as DB encoding.
+ * Convert string using encoding_name. The source
+ * encoding is the DB encoding.
*
- * TEXT convert(TEXT string, NAME encoding_name) */
+ * BYTEA convert_to(TEXT string, NAME encoding_name) */
Datum
-pg_convert(PG_FUNCTION_ARGS)
+pg_convert_to(PG_FUNCTION_ARGS)
{
Datum string = PG_GETARG_DATUM(0);
Datum dest_encoding_name = PG_GETARG_DATUM(1);
Datum result;
result = DirectFunctionCall3(
- pg_convert2, string, src_encoding_name, dest_encoding_name);
+ pg_convert, string, src_encoding_name, dest_encoding_name);
+
+ /* free memory allocated by namein */
+ pfree((void *) src_encoding_name);
+
+ PG_RETURN_BYTEA_P(result);
+}
+
+/*
+ * Convert string using encoding_name. The destination
+ * encoding is the DB encoding.
+ *
+ * TEXT convert_from(BYTEA string, NAME encoding_name) */
+Datum
+pg_convert_from(PG_FUNCTION_ARGS)
+{
+ Datum string = PG_GETARG_DATUM(0);
+ Datum src_encoding_name = PG_GETARG_DATUM(1);
+ Datum dest_encoding_name = DirectFunctionCall1(
+ namein, CStringGetDatum(DatabaseEncoding->name));
+ Datum result;
+
+ result = DirectFunctionCall3(
+ pg_convert, string, src_encoding_name, dest_encoding_name);
/* free memory allocated by namein */
pfree((void *) src_encoding_name);
}
/*
- * Convert string using encoding_name.
+ * Convert string using encoding_names.
*
- * TEXT convert2(TEXT string, NAME src_encoding_name, NAME dest_encoding_name)
+ * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
*/
Datum
-pg_convert2(PG_FUNCTION_ARGS)
+pg_convert(PG_FUNCTION_ARGS)
{
- text *string = PG_GETARG_TEXT_P(0);
+ bytea *string = PG_GETARG_TEXT_P(0);
char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
int src_encoding = pg_char_to_encoding(src_encoding_name);
char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
int dest_encoding = pg_char_to_encoding(dest_encoding_name);
unsigned char *result;
- text *retval;
+ bytea *retval;
unsigned char *str;
int len;
errmsg("invalid destination encoding name \"%s\"",
dest_encoding_name)));
- /* make sure that source string is null terminated */
+ /* make sure that source string is valid and null terminated */
len = VARSIZE(string) - VARHDRSZ;
+ pg_verify_mbstr(src_encoding,VARDATA(string),len,false);
str = palloc(len + 1);
memcpy(str, VARDATA(string), len);
*(str + len) = '\0';
elog(ERROR, "encoding conversion failed");
/*
- * build text data type structure. we cannot use textin() here, since
- * textin assumes that input string encoding is same as database encoding.
+ * build bytea data type structure.
*/
len = strlen((char *) result) + VARHDRSZ;
retval = palloc(len);
/* free memory if allocated by the toaster */
PG_FREE_IF_COPY(string, 0);
- PG_RETURN_TEXT_P(retval);
+ PG_RETURN_BYTEA_P(retval);
+}
+
+/*
+ * get the length of the string considered as text in the specified
+ * encoding. Raises an error if the data is not valid in that
+ * encoding.
+ *
+ * INT4 length (BYTEA string, NAME src_encoding_name)
+ */
+Datum
+length_in_encoding(PG_FUNCTION_ARGS)
+{
+ bytea *string = PG_GETARG_BYTEA_P(0);
+ char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
+ int src_encoding = pg_char_to_encoding(src_encoding_name);
+ int len = VARSIZE(string) - VARHDRSZ;
+ int retval;
+
+ /*
+ * Reject unknown encoding names before verifying: an unrecognized name
+ * yields a negative encoding id, and pg_verify_mbstr_len() both asserts
+ * a valid id and uses it to index the encoding table.
+ */
+ if (src_encoding < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid encoding name \"%s\"",
+ src_encoding_name)));
+
+ /* noError is false, so invalid data raises an error instead of returning -1 */
+ retval = pg_verify_mbstr_len(src_encoding, VARDATA(string), len, false);
+ PG_RETURN_INT32(retval);
+
}
/*
/*
* conversion functions between pg_wchar and multibyte streams.
* Tatsuo Ishii
- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.63 2007/07/12 21:17:09 tgl Exp $
- *
- * WIN1250 client encoding updated by Pavel Behal
+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.64 2007/09/18 17:41:17 adunstan Exp $
*
*/
/* can be used in either frontend or backend */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
- return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError);
+ return
+ pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0; /* -1 means invalid; any character count, including 0, means valid */
}
/*
+ * Verify mbstr to make sure that it is validly encoded in the specified
+ * encoding.
+ *
+ */
+bool
+pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
+{
+ return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0; /* -1 means invalid; any character count, including 0, means valid */
+}
+
+/*
* Verify mbstr to make sure that it is validly encoded in the specified
* encoding.
*
* mbstr is not necessarily zero terminated; length of mbstr is
* specified by len.
*
- * If OK, return TRUE. If a problem is found, return FALSE when noError is
+ * If OK, return length of string in the encoding.
+ * If a problem is found, return -1 when noError is
* true; when noError is false, ereport() a descriptive message.
- */
-bool
-pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
+ */
+int
+pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
{
mbverifier mbverify;
+ int mb_len;
Assert(PG_VALID_ENCODING(encoding));
const char *nullpos = memchr(mbstr, 0, len);
if (nullpos == NULL)
- return true;
+ return len;
if (noError)
- return false;
+ return -1;
report_invalid_encoding(encoding, nullpos, 1);
}
/* fetch function pointer just once */
mbverify = pg_wchar_table[encoding].mbverify;
+
+ mb_len = 0;
while (len > 0)
{
{
if (*mbstr != '\0')
{
+ mb_len++;
mbstr++;
len--;
continue;
}
if (noError)
- return false;
+ return -1;
report_invalid_encoding(encoding, mbstr, len);
}
if (l < 0)
{
if (noError)
- return false;
+ return -1;
report_invalid_encoding(encoding, mbstr, len);
}
mbstr += l;
len -= l;
+ mb_len++;
}
- return true;
+ return mb_len;
}
/*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.424 2007/09/11 03:28:05 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.425 2007/09/18 17:41:17 adunstan Exp $
*
*-------------------------------------------------------------------------
*/
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 200709101
+#define CATALOG_VERSION_NO 200709181
#endif
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.469 2007/09/11 03:28:05 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.470 2007/09/18 17:41:17 adunstan Exp $
*
* NOTES
* The script catalog/genbki.sh reads this file and generates .bki
DATA(insert OID = 810 ( pg_client_encoding PGNSP PGUID 12 1 0 f f t f s 0 19 "" _null_ _null_ _null_ pg_client_encoding - _null_ _null_ ));
DESCR("encoding name of current database");
-DATA(insert OID = 1717 ( convert PGNSP PGUID 12 1 0 f f t f s 2 25 "25 19" _null_ _null_ _null_ pg_convert - _null_ _null_ ));
+DATA(insert OID = 1713 ( length PGNSP PGUID 12 1 0 f f t f s 2 23 "17 19" _null_ _null_ _null_ length_in_encoding - _null_ _null_ ));
+DESCR("length of string in specified encoding");
+
+DATA(insert OID = 1714 ( convert_from PGNSP PGUID 12 1 0 f f t f s 2 25 "17 19" _null_ _null_ _null_ pg_convert_from - _null_ _null_ ));
+DESCR("convert string with specified source encoding name");
+
+DATA(insert OID = 1717 ( convert_to PGNSP PGUID 12 1 0 f f t f s 2 17 "25 19" _null_ _null_ _null_ pg_convert_to - _null_ _null_ ));
DESCR("convert string with specified destination encoding name");
-DATA(insert OID = 1813 ( convert PGNSP PGUID 12 1 0 f f t f s 3 25 "25 19 19" _null_ _null_ _null_ pg_convert2 - _null_ _null_ ));
+DATA(insert OID = 1813 ( convert PGNSP PGUID 12 1 0 f f t f s 3 17 "17 19 19" _null_ _null_ _null_ pg_convert - _null_ _null_ ));
DESCR("convert string with specified encoding names");
-DATA(insert OID = 1619 ( convert_using PGNSP PGUID 12 1 0 f f t f s 2 25 "25 25" _null_ _null_ _null_ pg_convert_using - _null_ _null_ ));
+DATA(insert OID = 1619 ( convert_using PGNSP PGUID 12 1 0 f f t f s 2 17 "25 25" _null_ _null_ _null_ pg_convert_using - _null_ _null_ ));
DESCR("convert string with specified conversion name");
DATA(insert OID = 1264 ( pg_char_to_encoding PGNSP PGUID 12 1 0 f f t f s 1 23 "19" _null_ _null_ _null_ PG_char_to_encoding - _null_ _null_ ));
-/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.72 2007/04/15 10:56:30 ishii Exp $ */
+/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.73 2007/09/18 17:41:17 adunstan Exp $ */
#ifndef PG_WCHAR_H
#define PG_WCHAR_H
extern bool pg_verifymbstr(const char *mbstr, int len, bool noError);
extern bool pg_verify_mbstr(int encoding, const char *mbstr, int len,
bool noError);
+extern int pg_verify_mbstr_len(int encoding, const char *mbstr, int len,
+ bool noError);
extern void report_invalid_encoding(int encoding, const char *mbstr, int len);
extern void report_untranslatable_char(int src_encoding, int dest_encoding,
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.302 2007/09/04 16:41:43 adunstan Exp $
+ * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.303 2007/09/18 17:41:17 adunstan Exp $
*
*-------------------------------------------------------------------------
*/
extern Datum PG_character_set_name(PG_FUNCTION_ARGS);
extern Datum PG_character_set_id(PG_FUNCTION_ARGS);
extern Datum pg_convert(PG_FUNCTION_ARGS);
-extern Datum pg_convert2(PG_FUNCTION_ARGS);
+extern Datum pg_convert_to(PG_FUNCTION_ARGS);
+extern Datum pg_convert_from(PG_FUNCTION_ARGS);
+extern Datum length_in_encoding(PG_FUNCTION_ARGS);
/* format_type.c */
extern Datum format_type(PG_FUNCTION_ARGS);