The SWORD Project  1.9.0.svnversion
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
utf8transliterator.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * utf8transliterator.cpp - SWFilter descendant to transliterate between
4  * ICU-supported scripts
5  *
6  * $Id: utf8transliterator.cpp 3822 2020-11-03 18:54:47Z scribe $
7  *
8  * Copyright 2001-2013 CrossWire Bible Society (http://www.crosswire.org)
9  * CrossWire Bible Society
10  * P. O. Box 2528
11  * Tempe, AZ 85280-2528
12  *
13  * This program is free software; you can redistribute it and/or modify it
14  * under the terms of the GNU General Public License as published by the
15  * Free Software Foundation version 2.
16  *
17  * This program is distributed in the hope that it will be useful, but
18  * WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  * General Public License for more details.
21  *
22  */
23 
24 #ifdef _ICU_
25 
26 #include <stdlib.h>
27 
28 #include <utilstr.h>
29 
30 #include <unicode/ucnv.h>
31 #include <unicode/uchar.h>
32 #include <utf8transliterator.h>
33 #include <swmodule.h>
34 
35 #ifndef _ICUSWORD_
36 #include "unicode/resbund.h"
37 #endif
38 #include <swlog.h>
39 
41 
43  "Off",
44  "Latin",
45  /*
46  "IPA",
47  "Basic Latin",
48  "SBL",
49  "TC",
50  "Beta",
51  "BGreek",
52  "SERA",
53  "Hugoye",
54  "UNGEGN",
55  "ISO",
56  "ALA-LC",
57  "BGN",
58  "Greek",
59  "Hebrew",
60  "Cyrillic",
61  "Arabic",
62  "Syriac",
63  "Katakana",
64  "Hiragana",
65  "Hangul",
66  "Devanagari",
67  "Tamil",
68  "Bengali",
69  "Gurmukhi",
70  "Gujarati",
71  "Oriya",
72  "Telugu",
73  "Kannada",
74  "Malayalam",
75  "Thai",
76  "Georgian",
77  "Armenian",
78  "Ethiopic",
79  "Gothic",
80  "Ugaritic",
81  "Coptic",
82  "Linear B",
83  "Cypriot",
84  "Runic",
85  "Ogham",
86  "Thaana",
87  "Glagolitic",
88  "Cherokee",
89  */
90 };
91 
// Static name and tip strings of this filter's option.
// NOTE(review): presumably handed to the option-filter base class by the
// constructor, which is not visible in this listing -- confirm.
92 const char UTF8Transliterator::optName[] = "Transliteration";
93 const char UTF8Transliterator::optTip[] = "Transliterates between scripts";
94 
95 #ifdef ICU_CUSTOM_RESOURCE_BUILDING
// Shared table of transliterator ID -> (resource name, direction).
// Filled by registerTrans() and consulted by checkTrans().
96 SWTransMap UTF8Transliterator::transMap;
97 
98 #ifndef _ICUSWORD_
99 
// Resource-bundle keys: the index table read by Load() and the per-bundle
// rule string read by checkTrans().
100 const char UTF8Transliterator::SW_RB_RULE_BASED_IDS[] = "RuleBasedTransliteratorIDs";
101 const char UTF8Transliterator::SW_RB_RULE[] = "Rule";
// Directory passed to ures_openDirect(); SWICU_DATA overrides the built-in
// default path when it is defined at compile time.
102 #ifdef SWICU_DATA
103 const char UTF8Transliterator::SW_RESDATA[] = SWICU_DATA;
104 #else
105 const char UTF8Transliterator::SW_RESDATA[] = "/usr/local/lib/sword/";
106 #endif
107 
108 class SWCharString {
109  public:
110  inline SWCharString(const UnicodeString& str);
111  inline ~SWCharString();
112  inline operator const char*() { return ptr; }
113  private:
114  char buf[128];
115  char* ptr;
116 };
117 SWCharString::SWCharString(const UnicodeString& str) {
118  // TODO This isn't quite right -- we should probably do
119  // preflighting here to determine the real length.
120  if (str.length() >= (int32_t)sizeof(buf)) {
121  ptr = new char[str.length() + 8];
122  } else {
123  ptr = buf;
124  }
125  str.extract(0, 0x7FFFFFFF, ptr, "");
126 }
127 
128 SWCharString::~SWCharString() {
129  if (ptr != buf) {
130  delete[] ptr;
131  }
132 }
133 
134 #endif // _ICUSWORD_
135 #endif // ICU_CUSTOM_RESOURCE_BUILDING
136 
138  option = 0;
139  unsigned long i;
140  for (i = 0; i < NUMTARGETSCRIPTS; i++) {
141  options.push_back(optionstring[i]);
142  }
143 #ifdef ICU_CUSTOM_RESOURCE_BUILDING
144 #ifndef _ICUSWORD_
145  utf8status = U_ZERO_ERROR;
146  Load(utf8status);
147 #endif
148 #endif
149 }
150 
151 
153 }
154 
155 #ifdef ICU_CUSTOM_RESOURCE_BUILDING
156 void UTF8Transliterator::Load(UErrorCode &status)
157 {
158 #ifndef _ICUSWORD_
159  static const char translit_swordindex[] = "translit_swordindex";
160 
161  UResourceBundle *bundle = 0, *transIDs = 0, *colBund = 0;
162  bundle = ures_openDirect(SW_RESDATA, translit_swordindex, &status);
163  if (U_FAILURE(status)) {
164  SWLog::getSystemLog()->logError("UTF8Transliterator: ICU: no resource index to load");
165  SWLog::getSystemLog()->logError("UTF8Transliterator: ICU: status %s", u_errorName(status));
166  return;
167  }
168 
169  transIDs = ures_getByKey(bundle, SW_RB_RULE_BASED_IDS, 0, &status);
170  //UParseError parseError;
171 
172  int32_t row, maxRows;
173  if (U_SUCCESS(status)) {
174  maxRows = ures_getSize(transIDs);
175  for (row = 0; row < maxRows; row++) {
176  colBund = ures_getByIndex(transIDs, row, 0, &status);
177 
178  if (U_SUCCESS(status) && ures_getSize(colBund) == 4) {
179  UnicodeString id = ures_getUnicodeStringByIndex(colBund, 0, &status);
180  UChar type = ures_getUnicodeStringByIndex(colBund, 1, &status).charAt(0);
181  UnicodeString resString = ures_getUnicodeStringByIndex(colBund, 2, &status);
182 SWLOGD("ok so far");
183 
184  if (U_SUCCESS(status)) {
185  switch (type) {
186  case 0x66: // 'f'
187  case 0x69: // 'i'
188  // 'file' or 'internal';
189  // row[2]=resource, row[3]=direction
190  {
191  //UBool visible = (type == 0x0066 /*f*/);
192  UTransDirection dir =
193  (ures_getUnicodeStringByIndex(colBund, 3, &status).charAt(0) ==
194  0x0046 /*F*/) ?
195  UTRANS_FORWARD : UTRANS_REVERSE;
196  //registry->put(id, resString, dir, visible);
197 SWLOGD("instantiating %s ...", resString.getBuffer());
198  registerTrans(id, resString, dir, status);
199 SWLOGD("done.");
200  }
201  break;
202  case 0x61: // 'a'
203  // 'alias'; row[2]=createInstance argument
204  //registry->put(id, resString, TRUE);
205  break;
206  }
207  }
208  else SWLog::getSystemLog()->logError("UTF8Transliterator: ICU: Failed to get resString");
209  }
210  else SWLog::getSystemLog()->logError("UTF8Transliterator: ICU: Failed to get row");
211  ures_close(colBund);
212  }
213  }
214  else {
215  SWLog::getSystemLog()->logError("UTF8Transliterator: ICU: no resource index to load");
216  SWLog::getSystemLog()->logError("UTF8Transliterator: ICU: status %s", u_errorName(status));
217  }
218 
219  ures_close(transIDs);
220  ures_close(bundle);
221 
222 #endif // _ICUSWORD_
223 }
224 
225 void UTF8Transliterator::registerTrans(const UnicodeString& ID, const UnicodeString& resource,
226  UTransDirection dir, UErrorCode &status )
227 {
228 #ifndef _ICUSWORD_
229 SWLOGD("registering ID locally %s", ID.getBuffer());
230  SWTransData swstuff;
231  swstuff.resource = resource;
232  swstuff.dir = dir;
233  SWTransPair swpair;
234  swpair.first = ID;
235  swpair.second = swstuff;
236  transMap.insert(swpair);
237 #endif
238 }
239 
240 bool UTF8Transliterator::checkTrans(const UnicodeString& ID, UErrorCode &status )
241 {
242 #ifndef _ICUSWORD_
243  Transliterator *trans = Transliterator::createInstance(ID, UTRANS_FORWARD, status);
244  if (!U_FAILURE(status)) {
245  // already have it, clean up and return true
246 SWLOGD("already have it %s", ID.getBuffer());
247  delete trans;
248  return true;
249  }
250  status = U_ZERO_ERROR;
251 
252  SWTransMap::iterator swelement;
253  if ((swelement = transMap.find(ID)) != transMap.end()) {
254 SWLOGD("found element in map");
255  SWTransData swstuff = (*swelement).second;
256  UParseError parseError;
257  //UErrorCode status;
258  //std::cout << "unregistering " << ID << std::endl;
259  //Transliterator::unregister(ID);
260 SWLOGD("resource is %s", swstuff.resource.getBuffer());
261 
262  // Get the rules
263  //std::cout << "importing: " << ID << ", " << resource << std::endl;
264  SWCharString ch(swstuff.resource);
265  UResourceBundle *bundle = ures_openDirect(SW_RESDATA, ch, &status);
266  const UnicodeString rules = ures_getUnicodeStringByKey(bundle, SW_RB_RULE, &status);
267  ures_close(bundle);
268  //parser.parse(rules, isReverse ? UTRANS_REVERSE : UTRANS_FORWARD,
269  // parseError, status);
270  if (U_FAILURE(status)) {
271  SWLog::getSystemLog()->logError("UTF8Transliterator: ICU: Failed to get rules");
272  SWLog::getSystemLog()->logError("UTF8Transliterator: ICU: status %s", u_errorName(status));
273  return false;
274  }
275 
276 
277  Transliterator *trans = Transliterator::createFromRules(ID, rules, swstuff.dir,
278  parseError,status);
279  if (U_FAILURE(status)) {
280  SWLog::getSystemLog()->logError("UTF8Transliterator: ICU: Failed to create transliterator");
281  SWLog::getSystemLog()->logError("UTF8Transliterator: ICU: status %s", u_errorName(status));
282  SWLog::getSystemLog()->logError("UTF8Transliterator: ICU: Parse error: line %s", parseError.line);
283  SWLog::getSystemLog()->logError("UTF8Transliterator: ICU: Parse error: offset %d", parseError.offset);
284  SWLog::getSystemLog()->logError("UTF8Transliterator: ICU: Parse error: preContext %s", *parseError.preContext);
285  SWLog::getSystemLog()->logError("UTF8Transliterator: ICU: Parse error: postContext %s", *parseError.postContext);
286  SWLog::getSystemLog()->logError("UTF8Transliterator: ICU: rules were");
287 // SWLog::getSystemLog()->logError((const char *)rules);
288  return false;
289  }
290 
291  Transliterator::registerInstance(trans);
292  return true;
293 
294  //Transliterator *trans = instantiateTrans(ID, swstuff.resource, swstuff.dir, parseError, status);
295  //return trans;
296  }
297  else {
298  return false;
299  }
300 #else
301 return true;
302 #endif // _ICUSWORD_
303 }
304 #endif // ICU_CUSTOM_RESOURCE_BUILDING
305 
306 bool UTF8Transliterator::addTrans(const char* newTrans, SWBuf* transList) {
307 #ifdef ICU_CUSTOM_RESOURCE_BUILDING
308 #ifdef _ICUSWORD_
309  UErrorCode status;
310  if (checkTrans(UnicodeString(newTrans), status)) {
311 #endif
312 #endif // ICU_CUSTOM_RESOURCE_BUILDING
313  *transList += newTrans;
314  *transList += ";";
315  return true;
316 #ifdef ICU_CUSTOM_RESOURCE_BUILDING
317 #ifdef _ICUSWORD_
318  }
319  else {
320  return false;
321  }
322 #endif
323 #endif // ICU_CUSTOM_RESOURCE_BUILDING
324 }
325 
326 
327 icu::Transliterator * UTF8Transliterator::createTrans(const icu::UnicodeString& ID, UTransDirection dir, UErrorCode &status )
328 {
329  icu::Transliterator *trans = icu::Transliterator::createInstance(ID,UTRANS_FORWARD,status);
330  if (U_FAILURE(status)) {
331  delete trans;
332  return NULL;
333  }
334  else {
335  return trans;
336  }
337 }
338 
339 void UTF8Transliterator::setOptionValue(const char *ival)
340 {
341  unsigned char i = option = NUMTARGETSCRIPTS;
342  while (i && stricmp(ival, optionstring[i])) {
343  i--;
344  option = i;
345  }
346 }
347 
349 {
350  return (NUMTARGETSCRIPTS > option) ? optionstring[option] : 0;
351 }
352 
353 char UTF8Transliterator::processText(SWBuf &text, const SWKey *key, const SWModule *module)
354 {
355  if (option) { // if we want transliteration
356  unsigned long i, j;
357  UErrorCode err = U_ZERO_ERROR;
358  UConverter * conv = NULL;
359  conv = ucnv_open("UTF-8", &err);
360  SWBuf ID;
361 
362  bool compat = false;
363 
364  // Convert UTF-8 string to UTF-16 (UChars)
365  j = strlen(text);
366  int32_t len = (j * 2) + 1;
367  UChar *source = new UChar[len];
368  err = U_ZERO_ERROR;
369  len = ucnv_toUChars(conv, source, len, text, j, &err);
370  source[len] = 0;
371 
372  // Figure out which scripts are used in the string
373  unsigned char scripts[NUMSCRIPTS];
374 
375  for (i = 0; i < NUMSCRIPTS; i++) {
376  scripts[i] = false;
377  }
378 
379  for (i = 0; i < (unsigned long)len; i++) {
380  j = ublock_getCode(source[i]);
381  scripts[SE_LATIN] = true;
382  switch (j) {
383  //case UBLOCK_BASIC_LATIN: scripts[SE_LATIN] = true; break;
384  case UBLOCK_GREEK: scripts[SE_GREEK] = true; break;
385  case UBLOCK_HEBREW: scripts[SE_HEBREW] = true; break;
386  case UBLOCK_CYRILLIC: scripts[SE_CYRILLIC] = true; break;
387  case UBLOCK_ARABIC: scripts[SE_ARABIC] = true; break;
388  case UBLOCK_SYRIAC: scripts[SE_SYRIAC] = true; break;
389  case UBLOCK_KATAKANA: scripts[SE_KATAKANA] = true; break;
390  case UBLOCK_HIRAGANA: scripts[SE_HIRAGANA] = true; break;
391  case UBLOCK_HANGUL_SYLLABLES: scripts[SE_HANGUL] = true; break;
392  case UBLOCK_HANGUL_JAMO: scripts[SE_JAMO] = true; break;
393  case UBLOCK_DEVANAGARI: scripts[SE_DEVANAGARI] = true; break;
394  case UBLOCK_TAMIL: scripts[SE_TAMIL] = true; break;
395  case UBLOCK_BENGALI: scripts[SE_BENGALI] = true; break;
396  case UBLOCK_GURMUKHI: scripts[SE_GURMUKHI] = true; break;
397  case UBLOCK_GUJARATI: scripts[SE_GUJARATI] = true; break;
398  case UBLOCK_ORIYA: scripts[SE_ORIYA] = true; break;
399  case UBLOCK_TELUGU: scripts[SE_TELUGU] = true; break;
400  case UBLOCK_KANNADA: scripts[SE_KANNADA] = true; break;
401  case UBLOCK_MALAYALAM: scripts[SE_MALAYALAM] = true; break;
402  case UBLOCK_THAI: scripts[SE_THAI] = true; break;
403  case UBLOCK_GEORGIAN: scripts[SE_GEORGIAN] = true; break;
404  case UBLOCK_ARMENIAN: scripts[SE_ARMENIAN] = true; break;
405  case UBLOCK_ETHIOPIC: scripts[SE_ETHIOPIC] = true; break;
406  case UBLOCK_GOTHIC: scripts[SE_GOTHIC] = true; break;
407  case UBLOCK_UGARITIC: scripts[SE_UGARITIC] = true; break;
408 // case UBLOCK_MEROITIC: scripts[SE_MEROITIC] = true; break;
409  case UBLOCK_LINEAR_B_SYLLABARY: scripts[SE_LINEARB] = true; break;
410  case UBLOCK_CYPRIOT_SYLLABARY: scripts[SE_CYPRIOT] = true; break;
411  case UBLOCK_RUNIC: scripts[SE_RUNIC] = true; break;
412  case UBLOCK_OGHAM: scripts[SE_OGHAM] = true; break;
413  case UBLOCK_THAANA: scripts[SE_THAANA] = true; break;
414  case UBLOCK_GLAGOLITIC: scripts[SE_GLAGOLITIC] = true; break;
415  case UBLOCK_CHEROKEE: scripts[SE_CHEROKEE] = true; break;
416 // case UBLOCK_TENGWAR: scripts[SE_TENGWAR] = true; break;
417 // case UBLOCK_CIRTH: scripts[SE_CIRTH] = true; break;
418  case UBLOCK_CJK_RADICALS_SUPPLEMENT:
419  case UBLOCK_KANGXI_RADICALS:
420  case UBLOCK_IDEOGRAPHIC_DESCRIPTION_CHARACTERS:
421  case UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION:
422  case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
423  case UBLOCK_CJK_UNIFIED_IDEOGRAPHS:
424  scripts[SE_HAN] = true;
425  break;
426  case UBLOCK_CJK_COMPATIBILITY:
427  case UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS:
428  case UBLOCK_CJK_COMPATIBILITY_FORMS:
429  scripts[SE_HAN] = true;
430  compat = true;
431  break;
432  case UBLOCK_HANGUL_COMPATIBILITY_JAMO:
433  scripts[SE_HANGUL] = true;
434  compat = true;
435  break;
436 
437  //default: scripts[SE_LATIN] = true;
438  }
439  }
440  scripts[option] = false; //turn off the reflexive transliteration
441 
442  //return if we have no transliteration to do for this text
443  j = 0;
444  for (i = 0; !j && i < NUMSCRIPTS; i++) {
445  if (scripts[i]) j++;
446  }
447  if (!j) {
448  ucnv_close(conv);
449  return 0;
450  }
451 
452  if (compat) {
453  addTrans("NFKD", &ID);
454  }
455  else {
456  addTrans("NFD", &ID);
457  }
458 
459  //Simple X to Latin transliterators
460  if (scripts[SE_GREEK]) {
461  if (strnicmp (((SWModule*)module)->getLanguage(), "cop", 3)) {
462  if (option == SE_SBL)
463  addTrans("Greek-Latin/SBL", &ID);
464  else if (option == SE_TC)
465  addTrans("Greek-Latin/TC", &ID);
466  else if (option == SE_BETA)
467  addTrans("Greek-Latin/Beta", &ID);
468  else if (option == SE_BGREEK)
469  addTrans("Greek-Latin/BGreek", &ID);
470  else if (option == SE_UNGEGN)
471  addTrans("Greek-Latin/UNGEGN", &ID);
472  else if (option == SE_ISO)
473  addTrans("Greek-Latin/ISO", &ID);
474  else if (option == SE_ALALC)
475  addTrans("Greek-Latin/ALALC", &ID);
476  else if (option == SE_BGN)
477  addTrans("Greek-Latin/BGN", &ID);
478  else if (option == SE_IPA)
479  addTrans("Greek-IPA/Ancient", &ID);
480  else {
481  addTrans("Greek-Latin", &ID);
482  scripts[SE_LATIN] = true;
483  }
484  }
485  else {
486  if (option == SE_SBL)
487  addTrans("Coptic-Latin/SBL", &ID);
488  else if (option == SE_TC)
489  addTrans("Coptic-Latin/TC", &ID);
490  else if (option == SE_BETA)
491  addTrans("Coptic-Latin/Beta", &ID);
492  else if (option == SE_IPA)
493  addTrans("Coptic-IPA", &ID);
494  else {
495  addTrans("Coptic-Latin", &ID);
496  scripts[SE_LATIN] = true;
497  }
498  }
499  }
500  if (scripts[SE_HEBREW]) {
501  if (option == SE_SBL)
502  addTrans("Hebrew-Latin/SBL", &ID);
503  else if (option == SE_TC)
504  addTrans("Hebrew-Latin/TC", &ID);
505  else if (option == SE_BETA)
506  addTrans("Hebrew-Latin/Beta", &ID);
507  else if (option == SE_UNGEGN)
508  addTrans("Hebrew-Latin/UNGEGN", &ID);
509  else if (option == SE_ALALC)
510  addTrans("Hebrew-Latin/ALALC", &ID);
511  else if (option == SE_SYRIAC)
512  addTrans("Hebrew-Syriac", &ID);
513  else {
514  addTrans("Hebrew-Latin", &ID);
515  scripts[SE_LATIN] = true;
516  }
517  }
518  if (scripts[SE_CYRILLIC]) {
519  if (option == SE_GLAGOLITIC)
520  addTrans("Cyrillic-Glagolitic", &ID);
521  else {
522  addTrans("Cyrillic-Latin", &ID);
523  scripts[SE_LATIN] = true;
524  }
525  }
526  if (scripts[SE_ARABIC]) {
527  addTrans("Arabic-Latin", &ID);
528  scripts[SE_LATIN] = true;
529  }
530  if (scripts[SE_SYRIAC]) {
531  if (option == SE_TC)
532  addTrans("Syriac-Latin/TC", &ID);
533  else if (option == SE_BETA)
534  addTrans("Syriac-Latin/Beta", &ID);
535  else if (option == SE_HUGOYE)
536  addTrans("Syriac-Latin/Hugoye", &ID);
537  else if (option == SE_HEBREW)
538  addTrans("Syriac-Hebrew", &ID);
539  else {
540  addTrans("Syriac-Latin", &ID);
541  scripts[SE_LATIN] = true;
542  }
543  }
544  if (scripts[SE_THAI]) {
545  addTrans("Thai-Latin", &ID);
546  scripts[SE_LATIN] = true;
547  }
548  if (scripts[SE_GEORGIAN]) {
549  if (option == SE_ISO)
550  addTrans("Georgian-Latin/ISO", &ID);
551  else if (option == SE_ALALC)
552  addTrans("Georgian-Latin/ALALC", &ID);
553  else if (option == SE_BGN)
554  addTrans("Georgian-Latin/BGN", &ID);
555  else if (option == SE_IPA)
556  addTrans("Georgian-IPA", &ID);
557  else {
558  addTrans("Georgian-Latin", &ID);
559  scripts[SE_LATIN] = true;
560  }
561  }
562  if (scripts[SE_ARMENIAN]) {
563  if (option == SE_ISO)
564  addTrans("Armenian-Latin/ISO", &ID);
565  else if (option == SE_ALALC)
566  addTrans("Armenian-Latin/ALALC", &ID);
567  else if (option == SE_BGN)
568  addTrans("Armenian-Latin/BGN", &ID);
569  else if (option == SE_IPA)
570  addTrans("Armenian-IPA", &ID);
571  else {
572  addTrans("Armenian-Latin", &ID);
573  scripts[SE_LATIN] = true;
574  }
575  }
576  if (scripts[SE_ETHIOPIC]) {
577  if (option == SE_UNGEGN)
578  addTrans("Ethiopic-Latin/UNGEGN", &ID);
579  else if (option == SE_ISO)
580  addTrans("Ethiopic-Latin/ISO", &ID);
581  else if (option == SE_ALALC)
582  addTrans("Ethiopic-Latin/ALALC", &ID);
583  else if (option == SE_SERA)
584  addTrans("Ethiopic-Latin/SERA", &ID);
585  else {
586  addTrans("Ethiopic-Latin", &ID);
587  scripts[SE_LATIN] = true;
588  }
589  }
590  if (scripts[SE_GOTHIC]) {
591  if (option == SE_BASICLATIN)
592  addTrans("Gothic-Latin/Basic", &ID);
593  else if (option == SE_IPA)
594  addTrans("Gothic-IPA", &ID);
595  else {
596  addTrans("Gothic-Latin", &ID);
597  scripts[SE_LATIN] = true;
598  }
599  }
600  if (scripts[SE_UGARITIC]) {
601  if (option == SE_SBL)
602  addTrans("Ugaritic-Latin/SBL", &ID);
603  else {
604  addTrans("Ugaritic-Latin", &ID);
605  scripts[SE_LATIN] = true;
606  }
607  }
608  if (scripts[SE_MEROITIC]) {
609  addTrans("Meroitic-Latin", &ID);
610  scripts[SE_LATIN] = true;
611  }
612  if (scripts[SE_LINEARB]) {
613  addTrans("LinearB-Latin", &ID);
614  scripts[SE_LATIN] = true;
615  }
616  if (scripts[SE_CYPRIOT]) {
617  addTrans("Cypriot-Latin", &ID);
618  scripts[SE_LATIN] = true;
619  }
620  if (scripts[SE_RUNIC]) {
621  addTrans("Runic-Latin", &ID);
622  scripts[SE_LATIN] = true;
623  }
624  if (scripts[SE_OGHAM]) {
625  addTrans("Ogham-Latin", &ID);
626  scripts[SE_LATIN] = true;
627  }
628  if (scripts[SE_THAANA]) {
629  if (option == SE_ALALC)
630  addTrans("Thaana-Latin/ALALC", &ID);
631  else if (option == SE_BGN)
632  addTrans("Thaana-Latin/BGN", &ID);
633  else {
634  addTrans("Thaana-Latin", &ID);
635  scripts[SE_LATIN] = true;
636  }
637  }
638  if (scripts[SE_GLAGOLITIC]) {
639  if (option == SE_ISO)
640  addTrans("Glagolitic-Latin/ISO", &ID);
641  else if (option == SE_ALALC)
642  addTrans("Glagolitic-Latin/ALALC", &ID);
643  else if (option == SE_ALALC)
644  addTrans("Glagolitic-Cyrillic", &ID);
645  else {
646  addTrans("Glagolitic-Latin", &ID);
647  scripts[SE_LATIN] = true;
648  }
649  }
650  if (scripts[SE_CHEROKEE]) {
651  addTrans("Cherokee-Latin", &ID);
652  scripts[SE_LATIN] = true;
653  }
654  if (scripts[SE_THAI]) {
655  addTrans("Thai-Latin", &ID);
656  scripts[SE_LATIN] = true;
657  }
658  if (scripts[SE_THAI]) {
659  addTrans("Thai-Latin", &ID);
660  scripts[SE_LATIN] = true;
661  }
662 
663  if (scripts[SE_HAN]) {
664  if (!strnicmp (((SWModule*)module)->getLanguage(), "ja", 2)) {
665  addTrans("Kanji-Romaji", &ID);
666  }
667  else {
668  addTrans("Han-Latin", &ID);
669  }
670  scripts[SE_LATIN] = true;
671  }
672 
673  // Inter-Kana and Kana to Latin transliterators
674  if (option == SE_HIRAGANA && scripts[SE_KATAKANA]) {
675  addTrans("Katakana-Hiragana", &ID);
676  scripts[SE_HIRAGANA] = true;
677  }
678  else if (option == SE_KATAKANA && scripts[SE_HIRAGANA]) {
679  addTrans("Hiragana-Katakana", &ID);
680  scripts[SE_KATAKANA] = true;
681  }
682  else {
683  if (scripts[SE_KATAKANA]) {
684  addTrans("Katakana-Latin", &ID);
685  scripts[SE_LATIN] = true;
686  }
687  if (scripts[SE_HIRAGANA]) {
688  addTrans("Hiragana-Latin", &ID);
689  scripts[SE_LATIN] = true;
690  }
691  }
692 
693  // Korean to Latin transliterators
694  if (scripts[SE_HANGUL]) {
695  addTrans("Hangul-Latin", &ID);
696  scripts[SE_LATIN] = true;
697  }
698  if (scripts[SE_JAMO]) {
699  addTrans("Jamo-Latin", &ID);
700  scripts[SE_LATIN] = true;
701  }
702 
703  // Indic-Latin
704  if (option < SE_DEVANAGARI || option > SE_MALAYALAM) {
705  // Indic to Latin
706  if (scripts[SE_TAMIL]) {
707  addTrans("Tamil-Latin", &ID);
708  scripts[SE_LATIN] = true;
709  }
710  if (scripts[SE_BENGALI]) {
711  addTrans("Bengali-Latin", &ID);
712  scripts[SE_LATIN] = true;
713  }
714  if (scripts[SE_GURMUKHI]) {
715  addTrans("Gurmukhi-Latin", &ID);
716  scripts[SE_LATIN] = true;
717  }
718  if (scripts[SE_GUJARATI]) {
719  addTrans("Gujarati-Latin", &ID);
720  scripts[SE_LATIN] = true;
721  }
722  if (scripts[SE_ORIYA]) {
723  addTrans("Oriya-Latin", &ID);
724  scripts[SE_LATIN] = true;
725  }
726  if (scripts[SE_TELUGU]) {
727  addTrans("Telugu-Latin", &ID);
728  scripts[SE_LATIN] = true;
729  }
730  if (scripts[SE_KANNADA]) {
731  addTrans("Kannada-Latin", &ID);
732  scripts[SE_LATIN] = true;
733  }
734  if (scripts[SE_MALAYALAM]) {
735  addTrans("Malayalam-Latin", &ID);
736  scripts[SE_LATIN] = true;
737  }
738  }
739  else {
740  if (scripts[SE_LATIN]) {
741  addTrans("Latin-InterIndic", &ID);
742  }
743  if (scripts[SE_DEVANAGARI]) {
744  addTrans("Devanagari-InterIndic", &ID);
745  }
746  if (scripts[SE_TAMIL]) {
747  addTrans("Tamil-InterIndic", &ID);
748  }
749  if (scripts[SE_BENGALI]) {
750  addTrans("Bengali-InterIndic", &ID);
751  }
752  if (scripts[SE_GURMUKHI]) {
753  addTrans("Gurmurkhi-InterIndic", &ID);
754  }
755  if (scripts[SE_GUJARATI]) {
756  addTrans("Gujarati-InterIndic", &ID);
757  }
758  if (scripts[SE_ORIYA]) {
759  addTrans("Oriya-InterIndic", &ID);
760  }
761  if (scripts[SE_TELUGU]) {
762  addTrans("Telugu-InterIndic", &ID);
763  }
764  if (scripts[SE_KANNADA]) {
765  addTrans("Kannada-InterIndic", &ID);
766  }
767  if (scripts[SE_MALAYALAM]) {
768  addTrans("Malayalam-InterIndic", &ID);
769  }
770 
771  switch(option) {
772  case SE_DEVANAGARI:
773  addTrans("InterIndic-Devanagari", &ID);
774  break;
775  case SE_TAMIL:
776  addTrans("InterIndic-Tamil", &ID);
777  break;
778  case SE_BENGALI:
779  addTrans("InterIndic-Bengali", &ID);
780  break;
781  case SE_GURMUKHI:
782  addTrans("InterIndic-Gurmukhi", &ID);
783  break;
784  case SE_GUJARATI:
785  addTrans("InterIndic-Gujarati", &ID);
786  break;
787  case SE_ORIYA:
788  addTrans("InterIndic-Oriya", &ID);
789  break;
790  case SE_TELUGU:
791  addTrans("InterIndic-Telugu", &ID);
792  break;
793  case SE_KANNADA:
794  addTrans("InterIndic-Kannada", &ID);
795  break;
796  case SE_MALAYALAM:
797  addTrans("InterIndic-Malayalam", &ID);
798  break;
799  default:
800  addTrans("InterIndic-Latin", &ID);
801  scripts[SE_LATIN] = true;
802  break;
803  }
804  }
805 
806 // if (scripts[SE_TENGWAR]) {
807 // addTrans("Tengwar-Latin", &ID);
808 // scripts[SE_LATIN] = true;
809 // }
810 // if (scripts[SE_CIRTH]) {
811 // addTrans("Cirth-Latin", &ID);
812 // scripts[SE_LATIN] = true;
813 // }
814 
815  if (scripts[SE_LATIN]) {
816  switch (option) {
817  case SE_GREEK:
818  addTrans("Latin-Greek", &ID);
819  break;
820  case SE_HEBREW:
821  addTrans("Latin-Hebrew", &ID);
822  break;
823  case SE_CYRILLIC:
824  addTrans("Latin-Cyrillic", &ID);
825  break;
826  case SE_ARABIC:
827  addTrans("Latin-Arabic", &ID);
828  break;
829  case SE_SYRIAC:
830  addTrans("Latin-Syriac", &ID);
831  break;
832  case SE_THAI:
833  addTrans("Latin-Thai", &ID);
834  break;
835  case SE_GEORGIAN:
836  addTrans("Latin-Georgian", &ID);
837  break;
838  case SE_ARMENIAN:
839  addTrans("Latin-Armenian", &ID);
840  break;
841  case SE_ETHIOPIC:
842  addTrans("Latin-Ethiopic", &ID);
843  break;
844  case SE_GOTHIC:
845  addTrans("Latin-Gothic", &ID);
846  break;
847  case SE_UGARITIC:
848  addTrans("Latin-Ugaritic", &ID);
849  break;
850  case SE_COPTIC:
851  addTrans("Latin-Coptic", &ID);
852  break;
853  case SE_KATAKANA:
854  addTrans("Latin-Katakana", &ID);
855  break;
856  case SE_HIRAGANA:
857  addTrans("Latin-Hiragana", &ID);
858  break;
859  case SE_JAMO:
860  addTrans("Latin-Jamo", &ID);
861  break;
862  case SE_HANGUL:
863  addTrans("Latin-Hangul", &ID);
864  break;
865  case SE_MEROITIC:
866  addTrans("Latin-Meroitic", &ID);
867  break;
868  case SE_LINEARB:
869  addTrans("Latin-LinearB", &ID);
870  break;
871  case SE_CYPRIOT:
872  addTrans("Latin-Cypriot", &ID);
873  break;
874  case SE_RUNIC:
875  addTrans("Latin-Runic", &ID);
876  break;
877  case SE_OGHAM:
878  addTrans("Latin-Ogham", &ID);
879  break;
880  case SE_THAANA:
881  addTrans("Latin-Thaana", &ID);
882  break;
883  case SE_GLAGOLITIC:
884  addTrans("Latin-Glagolitic", &ID);
885  break;
886  case SE_CHEROKEE:
887  addTrans("Latin-Cherokee", &ID);
888  break;
889 // case SE_TENGWAR:
890 // addTrans("Latin-Tengwar", &ID);
891 // break;
892 // case SE_CIRTH:
893 // addTrans("Latin-Cirth", &ID);
894 // break;
895  }
896  }
897 
898  if (option == SE_BASICLATIN) {
899  addTrans("Any-Latin1", &ID);
900  }
901 
902  addTrans("NFC", &ID);
903 
904  err = U_ZERO_ERROR;
905  icu::Transliterator * trans = createTrans(icu::UnicodeString(ID), UTRANS_FORWARD, err);
906  if (trans && !U_FAILURE(err)) {
907  icu::UnicodeString target = icu::UnicodeString(source);
908  trans->transliterate(target);
909  text.setSize(text.size()*2);
910  len = ucnv_fromUChars(conv, text.getRawData(), text.size(), target.getBuffer(), target.length(), &err);
911  text.setSize(len);
912  delete trans;
913  }
914  ucnv_close(conv);
915  }
916  return 0;
917 }
918 
920 #endif
921 
922 
923 
virtual void setOptionValue(const char *ival)
static const char optTip[]
#define SWORD_NAMESPACE_START
Definition: defs.h:39
Definition: swbuf.h:47
virtual char processText(SWBuf &text, const SWKey *key=0, const SWModule *module=0)
virtual const char * getOptionValue()
static SWLog * getSystemLog()
Definition: swlog.cpp:53
SWText * module
Definition: osis2mod.cpp:105
static const char optName[]
int stricmp(const char *s1, const char *s2)
Definition: utilstr.cpp:194
return NULL
Definition: regex.c:7953
char * getRawData()
Definition: swbuf.h:379
#define NUMSCRIPTS
UTransDirection dir
std::map< const icu::UnicodeString, SWTransData > SWTransMap
unsigned long size() const
Definition: swbuf.h:185
static const char optionstring[NUMTARGETSCRIPTS][16]
#define NUMTARGETSCRIPTS
void logError(const char *fmt,...) const
Definition: swlog.cpp:87
int strnicmp(const char *s1, const char *s2, int len)
Definition: utilstr.cpp:180
std::pair< icu::UnicodeString, SWTransData > SWTransPair
#define SWORD_NAMESPACE_END
Definition: defs.h:40
#define SWLOGD(...)
Definition: defs.h:187
Definition: swkey.h:77
icu::Transliterator * createTrans(const icu::UnicodeString &ID, UTransDirection dir, UErrorCode &status)
bool addTrans(const char *newTrans, SWBuf *transList)
icu::UnicodeString resource
void setSize(unsigned long len)
Definition: swbuf.h:255