/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/
4 | |
---|
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
           Copyright (c) 1997-2008 University of Cambridge
10 | |
---|
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
25 | |
---|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
39 | |
---|
40 | |
---|
/* This module contains some fixed tables that are used by more than one of the
PCRE code modules. The tables are also #included by the pcretest program, which
uses macros to change their names from _pcre_xxx to xxxx, thereby avoiding name
clashes with the library. */
45 | |
---|
46 | |
---|
47 | #include "pcre_config.h" |
---|
48 | #include "pcre_internal.h" |
---|
49 | |
---|
50 | |
---|
51 | /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that |
---|
52 | the definition is next to the definition of the opcodes in pcre_internal.h. */ |
---|
53 | |
---|
54 | const uschar _pcre_OP_lengths[] = { OP_LENGTHS }; |
---|
55 | |
---|
56 | |
---|
57 | |
---|
/*************************************************
*            Tables for UTF-8 support            *
*************************************************/

/* These are the breakpoints for different numbers of bytes in a UTF-8
character. */
64 | |
---|
65 | #ifdef SUPPORT_UTF8 |
---|
66 | |
---|
/* Upper bound of the value range for each UTF-8 sequence length: entry [i] is
the largest value that can be encoded in i+1 bytes. */

const int _pcre_utf8_table1[] =
  { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};

/* Number of entries in _pcre_utf8_table1. The count is derived from the size
of the array's own first element, so it stays correct if the element type is
ever changed; the cast makes the narrowing from size_t to int explicit. */

const int _pcre_utf8_table1_size =
  (int)(sizeof(_pcre_utf8_table1)/sizeof(_pcre_utf8_table1[0]));
71 | |
---|
/* These are the indicator bits (_pcre_utf8_table2) and the mask for the data
bits (_pcre_utf8_table3) to set in the first byte of a character, indexed by
the number of additional bytes. Index 0 is the single-byte case, for which no
indicator bits are set. */

const int _pcre_utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
/* Mask for the data bits held in the first byte of a character, indexed by
the number of additional bytes that follow it. */

const int _pcre_utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
77 | |
---|
78 | /* Table of the number of extra bytes, indexed by the first byte masked with |
---|
79 | 0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */ |
---|
80 | |
---|
81 | const uschar _pcre_utf8_table4[] = { |
---|
82 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
---|
83 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
---|
84 | 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
---|
85 | 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; |
---|
86 | |
---|
87 | /* Table to translate from particular type value to the general value. */ |
---|
88 | |
---|
89 | const int _pcre_ucp_gentype[] = { |
---|
90 | ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */ |
---|
91 | ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */ |
---|
92 | ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */ |
---|
93 | ucp_N, ucp_N, ucp_N, /* Nd, Nl, No */ |
---|
94 | ucp_P, ucp_P, ucp_P, ucp_P, ucp_P, /* Pc, Pd, Pe, Pf, Pi */ |
---|
95 | ucp_P, ucp_P, /* Ps, Po */ |
---|
96 | ucp_S, ucp_S, ucp_S, ucp_S, /* Sc, Sk, Sm, So */ |
---|
97 | ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */ |
---|
98 | }; |
---|
99 | |
---|
/* The _pcre_utt[] table below translates Unicode property names into type and
code values. It is searched by binary chop, so must be in collating sequence of
name. Originally, the table contained pointers to the name strings in the first
field of each entry. However, that leads to a large number of relocations when
a shared library is dynamically loaded. A significant reduction is made by
putting all the names into a single, large string (_pcre_utt_names) and then
using offsets into that string in the table itself. Maintenance is more
error-prone, but frequent changes to this data are unlikely.

July 2008: There is now a script called maint/GenerateUtt.py which can be used
to generate this data instead of maintaining it entirely by hand. */
111 | |
---|
/* All Unicode property names, packed into one string. Each name is followed by
an explicit "\0", so a name can be read as an ordinary C string starting at its
byte offset within this array. The names are kept in ascending byte order. Any
change to a name alters the offset of every name after it, so the offsets that
refer into this string must be regenerated at the same time. */

const char _pcre_utt_names[] =
  "Any\0"
  "Arabic\0"
  "Armenian\0"
  "Balinese\0"
  "Bengali\0"
  "Bopomofo\0"
  "Braille\0"
  "Buginese\0"
  "Buhid\0"
  "C\0"
  "Canadian_Aboriginal\0"
  "Carian\0"
  "Cc\0"
  "Cf\0"
  "Cham\0"
  "Cherokee\0"
  "Cn\0"
  "Co\0"
  "Common\0"
  "Coptic\0"
  "Cs\0"
  "Cuneiform\0"
  "Cypriot\0"
  "Cyrillic\0"
  "Deseret\0"
  "Devanagari\0"
  "Ethiopic\0"
  "Georgian\0"
  "Glagolitic\0"
  "Gothic\0"
  "Greek\0"
  "Gujarati\0"
  "Gurmukhi\0"
  "Han\0"
  "Hangul\0"
  "Hanunoo\0"
  "Hebrew\0"
  "Hiragana\0"
  "Inherited\0"
  "Kannada\0"
  "Katakana\0"
  "Kayah_Li\0"
  "Kharoshthi\0"
  "Khmer\0"
  "L\0"
  "L&\0"
  "Lao\0"
  "Latin\0"
  "Lepcha\0"
  "Limbu\0"
  "Linear_B\0"
  "Ll\0"
  "Lm\0"
  "Lo\0"
  "Lt\0"
  "Lu\0"
  "Lycian\0"
  "Lydian\0"
  "M\0"
  "Malayalam\0"
  "Mc\0"
  "Me\0"
  "Mn\0"
  "Mongolian\0"
  "Myanmar\0"
  "N\0"
  "Nd\0"
  "New_Tai_Lue\0"
  "Nko\0"
  "Nl\0"
  "No\0"
  "Ogham\0"
  "Ol_Chiki\0"
  "Old_Italic\0"
  "Old_Persian\0"
  "Oriya\0"
  "Osmanya\0"
  "P\0"
  "Pc\0"
  "Pd\0"
  "Pe\0"
  "Pf\0"
  "Phags_Pa\0"
  "Phoenician\0"
  "Pi\0"
  "Po\0"
  "Ps\0"
  "Rejang\0"
  "Runic\0"
  "S\0"
  "Saurashtra\0"
  "Sc\0"
  "Shavian\0"
  "Sinhala\0"
  "Sk\0"
  "Sm\0"
  "So\0"
  "Sundanese\0"
  "Syloti_Nagri\0"
  "Syriac\0"
  "Tagalog\0"
  "Tagbanwa\0"
  "Tai_Le\0"
  "Tamil\0"
  "Telugu\0"
  "Thaana\0"
  "Thai\0"
  "Tibetan\0"
  "Tifinagh\0"
  "Ugaritic\0"
  "Vai\0"
  "Yi\0"
  "Z\0"
  "Zl\0"
  "Zp\0"
  "Zs\0";
229 | |
---|
230 | const ucp_type_table _pcre_utt[] = { |
---|
231 | { 0, PT_ANY, 0 }, |
---|
232 | { 4, PT_SC, ucp_Arabic }, |
---|
233 | { 11, PT_SC, ucp_Armenian }, |
---|
234 | { 20, PT_SC, ucp_Balinese }, |
---|
235 | { 29, PT_SC, ucp_Bengali }, |
---|
236 | { 37, PT_SC, ucp_Bopomofo }, |
---|
237 | { 46, PT_SC, ucp_Braille }, |
---|
238 | { 54, PT_SC, ucp_Buginese }, |
---|
239 | { 63, PT_SC, ucp_Buhid }, |
---|
240 | { 69, PT_GC, ucp_C }, |
---|
241 | { 71, PT_SC, ucp_Canadian_Aboriginal }, |
---|
242 | { 91, PT_SC, ucp_Carian }, |
---|
243 | { 98, PT_PC, ucp_Cc }, |
---|
244 | { 101, PT_PC, ucp_Cf }, |
---|
245 | { 104, PT_SC, ucp_Cham }, |
---|
246 | { 109, PT_SC, ucp_Cherokee }, |
---|
247 | { 118, PT_PC, ucp_Cn }, |
---|
248 | { 121, PT_PC, ucp_Co }, |
---|
249 | { 124, PT_SC, ucp_Common }, |
---|
250 | { 131, PT_SC, ucp_Coptic }, |
---|
251 | { 138, PT_PC, ucp_Cs }, |
---|
252 | { 141, PT_SC, ucp_Cuneiform }, |
---|
253 | { 151, PT_SC, ucp_Cypriot }, |
---|
254 | { 159, PT_SC, ucp_Cyrillic }, |
---|
255 | { 168, PT_SC, ucp_Deseret }, |
---|
256 | { 176, PT_SC, ucp_Devanagari }, |
---|
257 | { 187, PT_SC, ucp_Ethiopic }, |
---|
258 | { 196, PT_SC, ucp_Georgian }, |
---|
259 | { 205, PT_SC, ucp_Glagolitic }, |
---|
260 | { 216, PT_SC, ucp_Gothic }, |
---|
261 | { 223, PT_SC, ucp_Greek }, |
---|
262 | { 229, PT_SC, ucp_Gujarati }, |
---|
263 | { 238, PT_SC, ucp_Gurmukhi }, |
---|
264 | { 247, PT_SC, ucp_Han }, |
---|
265 | { 251, PT_SC, ucp_Hangul }, |
---|
266 | { 258, PT_SC, ucp_Hanunoo }, |
---|
267 | { 266, PT_SC, ucp_Hebrew }, |
---|
268 | { 273, PT_SC, ucp_Hiragana }, |
---|
269 | { 282, PT_SC, ucp_Inherited }, |
---|
270 | { 292, PT_SC, ucp_Kannada }, |
---|
271 | { 300, PT_SC, ucp_Katakana }, |
---|
272 | { 309, PT_SC, ucp_Kayah_Li }, |
---|
273 | { 318, PT_SC, ucp_Kharoshthi }, |
---|
274 | { 329, PT_SC, ucp_Khmer }, |
---|
275 | { 335, PT_GC, ucp_L }, |
---|
276 | { 337, PT_LAMP, 0 }, |
---|
277 | { 340, PT_SC, ucp_Lao }, |
---|
278 | { 344, PT_SC, ucp_Latin }, |
---|
279 | { 350, PT_SC, ucp_Lepcha }, |
---|
280 | { 357, PT_SC, ucp_Limbu }, |
---|
281 | { 363, PT_SC, ucp_Linear_B }, |
---|
282 | { 372, PT_PC, ucp_Ll }, |
---|
283 | { 375, PT_PC, ucp_Lm }, |
---|
284 | { 378, PT_PC, ucp_Lo }, |
---|
285 | { 381, PT_PC, ucp_Lt }, |
---|
286 | { 384, PT_PC, ucp_Lu }, |
---|
287 | { 387, PT_SC, ucp_Lycian }, |
---|
288 | { 394, PT_SC, ucp_Lydian }, |
---|
289 | { 401, PT_GC, ucp_M }, |
---|
290 | { 403, PT_SC, ucp_Malayalam }, |
---|
291 | { 413, PT_PC, ucp_Mc }, |
---|
292 | { 416, PT_PC, ucp_Me }, |
---|
293 | { 419, PT_PC, ucp_Mn }, |
---|
294 | { 422, PT_SC, ucp_Mongolian }, |
---|
295 | { 432, PT_SC, ucp_Myanmar }, |
---|
296 | { 440, PT_GC, ucp_N }, |
---|
297 | { 442, PT_PC, ucp_Nd }, |
---|
298 | { 445, PT_SC, ucp_New_Tai_Lue }, |
---|
299 | { 457, PT_SC, ucp_Nko }, |
---|
300 | { 461, PT_PC, ucp_Nl }, |
---|
301 | { 464, PT_PC, ucp_No }, |
---|
302 | { 467, PT_SC, ucp_Ogham }, |
---|
303 | { 473, PT_SC, ucp_Ol_Chiki }, |
---|
304 | { 482, PT_SC, ucp_Old_Italic }, |
---|
305 | { 493, PT_SC, ucp_Old_Persian }, |
---|
306 | { 505, PT_SC, ucp_Oriya }, |
---|
307 | { 511, PT_SC, ucp_Osmanya }, |
---|
308 | { 519, PT_GC, ucp_P }, |
---|
309 | { 521, PT_PC, ucp_Pc }, |
---|
310 | { 524, PT_PC, ucp_Pd }, |
---|
311 | { 527, PT_PC, ucp_Pe }, |
---|
312 | { 530, PT_PC, ucp_Pf }, |
---|
313 | { 533, PT_SC, ucp_Phags_Pa }, |
---|
314 | { 542, PT_SC, ucp_Phoenician }, |
---|
315 | { 553, PT_PC, ucp_Pi }, |
---|
316 | { 556, PT_PC, ucp_Po }, |
---|
317 | { 559, PT_PC, ucp_Ps }, |
---|
318 | { 562, PT_SC, ucp_Rejang }, |
---|
319 | { 569, PT_SC, ucp_Runic }, |
---|
320 | { 575, PT_GC, ucp_S }, |
---|
321 | { 577, PT_SC, ucp_Saurashtra }, |
---|
322 | { 588, PT_PC, ucp_Sc }, |
---|
323 | { 591, PT_SC, ucp_Shavian }, |
---|
324 | { 599, PT_SC, ucp_Sinhala }, |
---|
325 | { 607, PT_PC, ucp_Sk }, |
---|
326 | { 610, PT_PC, ucp_Sm }, |
---|
327 | { 613, PT_PC, ucp_So }, |
---|
328 | { 616, PT_SC, ucp_Sundanese }, |
---|
329 | { 626, PT_SC, ucp_Syloti_Nagri }, |
---|
330 | { 639, PT_SC, ucp_Syriac }, |
---|
331 | { 646, PT_SC, ucp_Tagalog }, |
---|
332 | { 654, PT_SC, ucp_Tagbanwa }, |
---|
333 | { 663, PT_SC, ucp_Tai_Le }, |
---|
334 | { 670, PT_SC, ucp_Tamil }, |
---|
335 | { 676, PT_SC, ucp_Telugu }, |
---|
336 | { 683, PT_SC, ucp_Thaana }, |
---|
337 | { 690, PT_SC, ucp_Thai }, |
---|
338 | { 695, PT_SC, ucp_Tibetan }, |
---|
339 | { 703, PT_SC, ucp_Tifinagh }, |
---|
340 | { 712, PT_SC, ucp_Ugaritic }, |
---|
341 | { 721, PT_SC, ucp_Vai }, |
---|
342 | { 725, PT_SC, ucp_Yi }, |
---|
343 | { 728, PT_GC, ucp_Z }, |
---|
344 | { 730, PT_PC, ucp_Zl }, |
---|
345 | { 733, PT_PC, ucp_Zp }, |
---|
346 | { 736, PT_PC, ucp_Zs } |
---|
347 | }; |
---|
348 | |
---|
349 | const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table); |
---|
350 | |
---|
351 | #endif /* SUPPORT_UTF8 */ |
---|
352 | |
---|
/* End of pcre_tables.c */