Qore Programming Language  0.8.11.1
QoreEncoding.h
Go to the documentation of this file.
1 /* -*- mode: c++; indent-tabs-mode: nil -*- */
2 /*
3  QoreEncoding.h
4 
5  Qore Programming Language
6 
7  Copyright (C) 2003 - 2014 David Nichols
8 
9  Permission is hereby granted, free of charge, to any person obtaining a
10  copy of this software and associated documentation files (the "Software"),
11  to deal in the Software without restriction, including without limitation
12  the rights to use, copy, modify, merge, publish, distribute, sublicense,
13  and/or sell copies of the Software, and to permit persons to whom the
14  Software is furnished to do so, subject to the following conditions:
15 
16  The above copyright notice and this permission notice shall be included in
17  all copies or substantial portions of the Software.
18 
19  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25  DEALINGS IN THE SOFTWARE.
26 
27  Note that the Qore library is released under a choice of three open-source
28  licenses: MIT (as above), LGPL 2+, or GPL 2+; see README-LICENSE for more
29  information.
30 */
31 
32 #ifndef _QORE_CHARSET_H
33 
34 #define _QORE_CHARSET_H
35 
41 #include <qore/common.h>
42 #include <qore/QoreThreadLock.h>
43 
44 #include <strings.h>
45 #include <string.h>
46 
47 #include <map>
48 
49 #include <string>
50 
//! for multi-byte character set encodings: gives the length of the string in characters
// the callback receives the start and end pointers of the data and reports problems through "invalid"
52 typedef qore_size_t (*mbcs_length_t)(const char* str, const char* end, bool &invalid);
53 
//! for multi-byte character set encodings: gives the number of bytes for the number of chars in the string or up to the end of the string
// "num_chars" is the character count to convert to a byte count; problems are reported through "invalid"
55 typedef qore_size_t (*mbcs_end_t)(const char* str, const char* end, qore_size_t num_chars, bool &invalid);
56 
//! for multi-byte character set encodings: gives the character position of the ptr
// "str" is the start of the data and "ptr" the byte position to translate; problems are reported through "invalid"
58 typedef qore_size_t (*mbcs_pos_t)(const char* str, const char* ptr, bool &invalid);
59 
61 
//! for multi-byte encodings: gives the number of total bytes for the character given one or more characters
// "valid_len" is the number of bytes that may be read starting at "str"
65 typedef qore_size_t (*mbcs_charlen_t)(const char* str, qore_size_t valid_len);
66 
67 class ExceptionSink;
68 
70 
80 class QoreEncoding {
81 private:
82  std::string code;
83  std::string desc;
84  mbcs_length_t flength;
85  mbcs_end_t fend;
86  mbcs_pos_t fpos;
87  mbcs_charlen_t fcharlen;
88  unsigned char maxwidth;
89 
90 public:
91  DLLLOCAL QoreEncoding(const char* n_code, const char* n_desc = 0, unsigned char n_maxwidth = 1, mbcs_length_t l = 0, mbcs_end_t e = 0, mbcs_pos_t p = 0, mbcs_charlen_t c = 0) : code(n_code), desc(n_desc ? n_desc : ""), flength(l), fend(e), fpos(p), fcharlen(c), maxwidth(n_maxwidth) {
92  }
93 
94  DLLLOCAL ~QoreEncoding() {
95  }
96 
98 
103  DLLLOCAL qore_size_t getLength(const char* p, const char* end, bool &invalid) const {
104  return flength ? flength(p, end, invalid) : strlen(p);
105  }
106 
108 
113  DLLLOCAL qore_size_t getLength(const char* p, const char* end, ExceptionSink* xsink) const;
114 
116 
122  DLLLOCAL qore_size_t getByteLen(const char* p, const char* end, qore_size_t c, bool& invalid) const {
123  return fend ? fend(p, end, c, invalid) : c;
124  }
125 
127 
133  DLLLOCAL qore_size_t getByteLen(const char* p, const char* end, qore_size_t c, ExceptionSink* xsink) const;
134 
136 
141  DLLLOCAL qore_size_t getCharPos(const char* p, const char* end, bool& invalid) const {
142  return fpos ? fpos(p, end, invalid) : end - p;
143  }
144 
146 
151  DLLLOCAL qore_size_t getCharPos(const char* p, const char* end, ExceptionSink* xsink) const;
152 
154 
159  DLLLOCAL qore_size_t getCharLen(const char* p, qore_size_t valid_len) const {
160  return fcharlen ? fcharlen(p, valid_len) : 1;
161  }
162 
164  DLLLOCAL bool isMultiByte() const {
165  return (bool)flength;
166  }
167 
169  DLLLOCAL const char* getCode() const {
170  return code.c_str();
171  }
172 
174  DLLLOCAL const char* getDesc() const {
175  return desc.empty() ? "<no description available>" : desc.c_str();
176  }
177 
179  DLLLOCAL int getMaxCharWidth() const {
180  return maxwidth;
181  }
182 };
183 
184 // case-insensitive maps for encodings
// keys are C-string pointers ordered by the ltcstrcase comparator (declared elsewhere);
// NOTE(review): the maps hold the key pointers themselves, not copies, so the strings they point to must outlive the entries
185 typedef std::map<const char*, QoreEncoding*, class ltcstrcase> encoding_map_t;
186 typedef std::map<const char*, const QoreEncoding*, class ltcstrcase> const_encoding_map_t;
187 
188 class QoreString;
189 
191 
194 private:
195  DLLLOCAL static encoding_map_t emap;
196  DLLLOCAL static const_encoding_map_t amap;
197  DLLLOCAL static class QoreThreadLock mutex;
198 
199  DLLLOCAL static const QoreEncoding* addUnlocked(const char* code, const char* desc, unsigned char maxwidth = 1, mbcs_length_t l = 0, mbcs_end_t e = 0, mbcs_pos_t p = 0, mbcs_charlen_t = 0);
200  DLLLOCAL static const QoreEncoding* findUnlocked(const char* name);
201 
202 public:
204  DLLEXPORT static void addAlias(const QoreEncoding* qcs, const char* alias);
205 
207  DLLEXPORT static const QoreEncoding* findCreate(const char* name);
208 
210  DLLEXPORT static const QoreEncoding* findCreate(const QoreString* str);
211 
213  DLLEXPORT static void showEncodings();
214 
216  DLLEXPORT static void showAliases();
217 
219  DLLEXPORT static const QoreEncoding* add(const char* code, const char* desc = 0, unsigned char maxwidth = 1, mbcs_length_t l = 0, mbcs_end_t e = 0, mbcs_pos_t p = 0, mbcs_charlen_t = 0);
220 
221  DLLLOCAL static void init(const char* def);
222  DLLLOCAL QoreEncodingManager();
223  DLLLOCAL ~QoreEncodingManager();
224 };
225 
//! exported free-function form of the byte-length query for the encoding "enc"
// NOTE(review): the remaining parameters mirror QoreEncoding::getByteLen(p, end, c, xsink); presumably forwards to it -- confirm in the implementation
226 DLLEXPORT qore_size_t q_get_byte_len(const QoreEncoding* enc, const char* p, const char* end, qore_size_t c, ExceptionSink* xsink);
//! exported free-function form of the character-length query for the encoding "enc"
// NOTE(review): unlike QoreEncoding::getCharLen(p, valid_len) this takes an ExceptionSink; presumably used to report errors -- confirm
227 DLLEXPORT qore_size_t q_get_char_len(const QoreEncoding* enc, const char* p, qore_size_t valid_len, ExceptionSink* xsink);
228 
//! the QoreEncodingManager object
230 DLLEXPORT extern QoreEncodingManager QEM;
231 
232 // builtin character encodings
233 DLLEXPORT extern const QoreEncoding* QCS_DEFAULT, //!< the default encoding for the Qore library
234  *QCS_USASCII, //!< ascii encoding
235  *QCS_UTF8, //!< UTF-8 multi-byte encoding (the only multi-byte encoding, all others are single-byte encodings)
236  *QCS_ISO_8859_1, //!< latin-1, Western European encoding
237  *QCS_ISO_8859_2, //!< latin-2, Central European encoding
238  *QCS_ISO_8859_3, //!< latin-3, Southern European character set
239  *QCS_ISO_8859_4, //!< latin-4, Northern European character set
240  *QCS_ISO_8859_5, //!< Cyrillic character set
241  *QCS_ISO_8859_6, //!< Arabic character set
242  *QCS_ISO_8859_7, //!< Greek character set
243  *QCS_ISO_8859_8, //!< Hebrew character set
244  *QCS_ISO_8859_9, //!< latin-5, Turkish character set
245  *QCS_ISO_8859_10, //!< latin-6, Nordic character set
246  *QCS_ISO_8859_11, //!< Thai character set
247  *QCS_ISO_8859_13, //!< latin-7, Baltic rim character set
248  *QCS_ISO_8859_14, //!< latin-8, Celtic character set
249  *QCS_ISO_8859_15, //!< latin-9, Western European with euro symbol
250  *QCS_ISO_8859_16, //!< latin-10, Southeast European character set
251  *QCS_KOI8_R, //!< Russian: Kod Obmena Informatsiey, 8 bit
252  *QCS_KOI8_U, //!< Ukrainian: Kod Obmena Informatsiey, 8 bit
253  *QCS_KOI7; //!< Russian: Kod Obmena Informatsiey, 7 bit characters
254 
//! returns the length of the next UTF-8 character or 0 for an encoding error
// "valid_len" is the number of bytes that may be read starting at "p"
// NOTE(review): the return type qore_size_t is unsigned (size_t), so a "negative" result for short input would arrive as a very large value -- confirm the exact contract against the implementation
256 DLLEXPORT qore_size_t q_UTF8_get_char_len(const char* p, qore_size_t valid_len);
257 
258 #endif // _QORE_CHARSET_H
DLLLOCAL const char * getDesc() const
returns the description for the encoding
Definition: QoreEncoding.h:174
qore_size_t(* mbcs_end_t)(const char *str, const char *end, qore_size_t num_chars, bool &invalid)
for multi-byte character set encodings: gives the number of bytes for the number of chars ...
Definition: QoreEncoding.h:55
DLLEXPORT const QoreEncoding * QCS_UTF8
UTF-8 multi-byte encoding (the only multi-byte encoding, all others are single-byte encodings) ...
DLLLOCAL int getMaxCharWidth() const
returns the maximum character width in bytes for the encoding
Definition: QoreEncoding.h:179
defines string encoding functions in Qore
Definition: QoreEncoding.h:80
DLLEXPORT const QoreEncoding * QCS_ISO_8859_8
Hebrew character set.
DLLEXPORT const QoreEncoding * QCS_ISO_8859_1
latin-1, Western European encoding
DLLEXPORT const QoreEncoding * QCS_DEFAULT
the default encoding for the Qore library
DLLLOCAL qore_size_t getLength(const char *p, const char *end, bool &invalid) const
gives the length of the string in characters
Definition: QoreEncoding.h:103
static DLLEXPORT void addAlias(const QoreEncoding *qcs, const char *alias)
adds an alias for an encoding
qore_size_t(* mbcs_charlen_t)(const char *str, qore_size_t valid_len)
for multi-byte encodings: gives the number of total bytes for the character given one or more characters
Definition: QoreEncoding.h:65
DLLEXPORT const QoreEncoding * QCS_ISO_8859_2
latin-2, Central European encoding
DLLEXPORT const QoreEncoding * QCS_ISO_8859_11
Thai character set.
manages encodings in Qore
Definition: QoreEncoding.h:193
DLLEXPORT const QoreEncoding * QCS_ISO_8859_3
latin-3, Southern European character set
size_t qore_size_t
used for sizes (same range as a pointer)
Definition: common.h:70
DLLEXPORT const QoreEncoding * QCS_ISO_8859_4
latin-4, Northern European character set
static DLLEXPORT void showEncodings()
prints out all valid encodings to stdout
DLLEXPORT const QoreEncoding * QCS_USASCII
ASCII encoding
qore_size_t(* mbcs_pos_t)(const char *str, const char *ptr, bool &invalid)
for multi-byte character set encodings: gives the character position of the ptr
Definition: QoreEncoding.h:58
DLLLOCAL const char * getCode() const
returns the string code (ex: "UTF-8") for the encoding
Definition: QoreEncoding.h:169
DLLLOCAL qore_size_t getByteLen(const char *p, const char *end, qore_size_t c, bool &invalid) const
gives the number of bytes for the number of chars in the string or up to the end of the string ...
Definition: QoreEncoding.h:122
DLLEXPORT const QoreEncoding * QCS_ISO_8859_10
latin-6, Nordic character set
Qore's string type supported by the QoreEncoding class.
Definition: QoreString.h:50
DLLEXPORT const QoreEncoding * QCS_KOI8_U
Ukrainian: Kod Obmena Informatsiey, 8 bit.
DLLEXPORT QoreEncodingManager QEM
the QoreEncodingManager object
DLLEXPORT const QoreEncoding * QCS_ISO_8859_9
latin-5, Turkish character set
static DLLEXPORT void showAliases()
prints out all aliases to stdout
static DLLEXPORT const QoreEncoding * findCreate(const char *name)
finds an encoding if it exists (also looks up against alias names) and creates a new one if it doesn't
DLLEXPORT const QoreEncoding * QCS_KOI7
Russian: Kod Obmena Informatsiey, 7 bit characters.
qore_size_t(* mbcs_length_t)(const char *str, const char *end, bool &invalid)
for multi-byte character set encodings: gives the length of the string in characters ...
Definition: QoreEncoding.h:52
container for holding Qore-language exception information and also for registering a "thread_exit" ca...
Definition: ExceptionSink.h:43
DLLEXPORT const QoreEncoding * QCS_ISO_8859_14
latin-8, Celtic character set
DLLLOCAL qore_size_t getCharPos(const char *p, const char *end, bool &invalid) const
gives the character position (number of characters) starting from the first pointer to the second ...
Definition: QoreEncoding.h:141
DLLEXPORT const QoreEncoding * QCS_ISO_8859_6
Arabic character set.
DLLLOCAL bool isMultiByte() const
returns true if the encoding is a multi-byte encoding
Definition: QoreEncoding.h:164
DLLEXPORT qore_size_t q_UTF8_get_char_len(const char *p, qore_size_t valid_len)
returns the length of the next UTF-8 character or 0 for an encoding error or a negative number if the...
DLLEXPORT const QoreEncoding * QCS_ISO_8859_5
Cyrillic character set.
provides a mutually-exclusive thread lock
Definition: QoreThreadLock.h:49
DLLLOCAL qore_size_t getCharLen(const char *p, qore_size_t valid_len) const
gives the number of total bytes for the character given one or more characters
Definition: QoreEncoding.h:159
static DLLEXPORT const QoreEncoding * add(const char *code, const char *desc=0, unsigned char maxwidth=1, mbcs_length_t l=0, mbcs_end_t e=0, mbcs_pos_t p=0, mbcs_charlen_t=0)
adds a new encoding to the list
DLLEXPORT const QoreEncoding * QCS_ISO_8859_16
latin-10, Southeast European character set
DLLEXPORT const QoreEncoding * QCS_ISO_8859_15
latin-9, Western European with euro symbol
DLLEXPORT const QoreEncoding * QCS_KOI8_R
Russian: Kod Obmena Informatsiey, 8 bit.
DLLEXPORT const QoreEncoding * QCS_ISO_8859_7
Greek character set.
DLLEXPORT const QoreEncoding * QCS_ISO_8859_13
latin-7, Baltic rim character set