/* ********************************************************************** * Copyright (C) 2002-2009, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * * File genctd.c */ //-------------------------------------------------------------------- // // Tool for generating CompactTrieDictionary data files (.ctd files). // // Usage: genctd [options] -o output-file.ctd input-file // // options: -v verbose // -? or -h help // // The input file is a plain text file containing words, one per line. // Words end at the first whitespace; lines beginning with whitespace // are ignored. // The file can be encoded as utf-8, or utf-16 (either endian), or // in the default code page (platform dependent.). utf encoded // files must include a BOM. // //-------------------------------------------------------------------- #include "unicode/utypes.h" #include "unicode/uchar.h" #include "unicode/ucnv.h" #include "unicode/uniset.h" #include "unicode/unistr.h" #include "unicode/uclean.h" #include "unicode/udata.h" #include "unicode/putil.h" #include "uoptions.h" #include "unewdata.h" #include "ucmndata.h" #include "rbbidata.h" #include "triedict.h" #include "cmemory.h" #include #include #include U_NAMESPACE_USE static char *progName; static UOption options[]={ UOPTION_HELP_H, /* 0 */ UOPTION_HELP_QUESTION_MARK, /* 1 */ UOPTION_VERBOSE, /* 2 */ { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 3 */ UOPTION_ICUDATADIR, /* 4 */ UOPTION_DESTDIR, /* 5 */ UOPTION_COPYRIGHT, /* 6 */ }; void usageAndDie(int retCode) { printf("Usage: %s [-v] [-options] -o output-file dictionary-file\n", progName); printf("\tRead in word list and write out compact trie dictionary\n" "options:\n" "\t-h or -? or --help this usage text\n" "\t-V or --version show a version message\n" "\t-c or --copyright include a copyright notice\n" "\t-v or --verbose turn on verbose output\n" "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" "\t followed by path, defaults to %s\n" "\t-d or --destdir destination directory, followed by the path\n", u_getDataDirectory()); exit (retCode); } #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO /* dummy UDataInfo cf. udata.h */ static UDataInfo dummyDataInfo = { sizeof(UDataInfo), 0, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, U_SIZEOF_UCHAR, 0, { 0, 0, 0, 0 }, /* dummy dataFormat */ { 0, 0, 0, 0 }, /* dummy formatVersion */ { 0, 0, 0, 0 } /* dummy dataVersion */ }; #else // // Set up the ICU data header, defined in ucmndata.h // DataHeader dh ={ {sizeof(DataHeader), // Struct MappedData 0xda, 0x27}, { // struct UDataInfo sizeof(UDataInfo), // size 0, // reserved U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, U_SIZEOF_UCHAR, 0, // reserved { 0x54, 0x72, 0x44, 0x63 }, // "TrDc" Trie Dictionary { 1, 0, 0, 0 }, // 1.0.0.0 { 0, 0, 0, 0 }, // Irrelevant for this data type }}; #endif //---------------------------------------------------------------------------- // // main for genctd // //---------------------------------------------------------------------------- int main(int argc, char **argv) { UErrorCode status = U_ZERO_ERROR; const char *wordFileName; const char *outFileName; const char *outDir = NULL; const char *copyright = NULL; // // Pick up and check the command line arguments, // using the standard ICU tool utils option handling. // U_MAIN_INIT_ARGS(argc, argv); progName = argv[0]; argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); if(argc<0) { // Unrecognized option fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } if(options[0].doesOccur || options[1].doesOccur) { // -? or -h for help. usageAndDie(0); } if (!options[3].doesOccur || argc < 2) { fprintf(stderr, "input and output file must both be specified.\n"); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } outFileName = options[3].value; wordFileName = argv[1]; if (options[4].doesOccur) { u_setDataDirectory(options[4].value); } status = U_ZERO_ERROR; /* Combine the directory with the file name */ if(options[5].doesOccur) { outDir = options[5].value; } if (options[6].doesOccur) { copyright = U_COPYRIGHT_STRING; } #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO UNewDataMemory *pData; char msg[1024]; /* write message with just the name */ sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); fprintf(stderr, "%s\n", msg); /* write the dummy data file */ pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status); udata_writeBlock(pData, msg, strlen(msg)); udata_finish(pData, &status); return (int)status; #else /* Initialize ICU */ u_init(&status); if (U_FAILURE(status)) { fprintf(stderr, "%s: can not initialize ICU. status = %s\n", argv[0], u_errorName(status)); exit(1); } status = U_ZERO_ERROR; // // Read in the dictionary source file // long result; long wordFileSize; FILE *file; char *wordBufferC; file = fopen(wordFileName, "rb"); if( file == 0 ) { fprintf(stderr, "Could not open file \"%s\"\n", wordFileName); exit(-1); } fseek(file, 0, SEEK_END); wordFileSize = ftell(file); fseek(file, 0, SEEK_SET); wordBufferC = new char[wordFileSize+10]; result = (long)fread(wordBufferC, 1, wordFileSize, file); if (result != wordFileSize) { fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); exit (-1); } wordBufferC[wordFileSize]=0; fclose(file); // // Look for a Unicode Signature (BOM) on the word file // int32_t signatureLength; const char * wordSourceC = wordBufferC; const char* encoding = ucnv_detectUnicodeSignature( wordSourceC, wordFileSize, &signatureLength, &status); if (U_FAILURE(status)) { exit(status); } if(encoding!=NULL ){ wordSourceC += signatureLength; wordFileSize -= signatureLength; } // // Open a converter to take the rule file to UTF-16 // UConverter* conv; conv = ucnv_open(encoding, &status); if (U_FAILURE(status)) { fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } // // Convert the words to UChar. // Preflight first to determine required buffer size. // uint32_t destCap = ucnv_toUChars(conv, NULL, // dest, 0, // destCapacity, wordSourceC, wordFileSize, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); exit(status); }; status = U_ZERO_ERROR; UChar *wordSourceU = new UChar[destCap+1]; ucnv_toUChars(conv, wordSourceU, // dest, destCap+1, wordSourceC, wordFileSize, &status); if (U_FAILURE(status)) { fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); exit(status); }; ucnv_close(conv); // Get rid of the original file buffer delete[] wordBufferC; // Create a MutableTrieDictionary, and loop through all the lines, inserting // words. // First, pick a median character. UChar *current = wordSourceU + (destCap/2); UChar uc = *current++; UnicodeSet breaks; breaks.add(0x000A); // Line Feed breaks.add(0x000D); // Carriage Return breaks.add(0x2028); // Line Separator breaks.add(0x2029); // Paragraph Separator do { // Look for line break while (uc && !breaks.contains(uc)) { uc = *current++; } // Now skip to first non-line-break while (uc && breaks.contains(uc)) { uc = *current++; } } while (uc && (breaks.contains(uc) || u_isspace(uc))); MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status); if (U_FAILURE(status)) { fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } // Now add the words. Words are non-space characters at the beginning of // lines, and must be at least one UChar. current = wordSourceU; UChar *candidate = current; uc = *current++; int32_t length = 0; while (uc) { while (uc && !u_isspace(uc)) { ++length; uc = *current++; } if (length > 0) { mtd->addWord(candidate, length, status); if (U_FAILURE(status)) { fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } } // Find beginning of next line while (uc && !breaks.contains(uc)) { uc = *current++; } while (uc && breaks.contains(uc)) { uc = *current++; } candidate = current-1; length = 0; } // Get rid of the Unicode text buffer delete[] wordSourceU; // Now, create a CompactTrieDictionary from the mutable dictionary CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); if (U_FAILURE(status)) { fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } // Get rid of the MutableTrieDictionary delete mtd; // // Get the binary data from the dictionary. // uint32_t outDataSize = ctd->dataSize(); const uint8_t *outData = (const uint8_t *)ctd->data(); // // Create the output file // size_t bytesWritten; UNewDataMemory *pData; pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status); if(U_FAILURE(status)) { fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n", outFileName, u_errorName(status)); exit(status); } // Write the data itself. udata_writeBlock(pData, outData, outDataSize); // finish up bytesWritten = udata_finish(pData, &status); if(U_FAILURE(status)) { fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status)); exit(status); } if (bytesWritten != outDataSize) { fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); exit(-1); } // Get rid of the CompactTrieDictionary delete ctd; u_cleanup(); printf("genctd: tool completed successfully.\n"); return 0; #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ }