/****************************************************************************** * * hebrewmcim.cpp - HebrewMCIM: Keyboard mapping for Michigan-Claremont * Hebrew input * * $Id: hebrewmcim.cpp 2833 2013-06-29 06:40:28Z chrislit $ * * Copyright 2001-2013 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society * P. O. Box 2528 * Tempe, AZ 85280-2528 * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation version 2. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * */ #include HebrewMCIM::HebrewMCIM() :SWInputMethod() { init(); } int *HebrewMCIM::translate(char in) { int retVal = 0; static int retString[5]; int retStringIndex = 0; memset(retString, 0, 5); if (getState() > 1) { if (getState() >= 12) { // serious issue with internal structure setState(0); retString[retStringIndex++] = in; return retString; } map::iterator find = subst2[getState()].find(in); if (find != subst2[getState()].end()) retVal = find->second; else retVal = in; setState(0); retString[retStringIndex++] = retVal; return retString; } else { retVal = subst[in]; if (retVal == 0) { setState(0); retString[retStringIndex++] = in; return retString; } if (retVal > 100) { setState(1); retString[retStringIndex++] = retVal; return retString; } if (retVal == 50) { // multiChar setState(1); int *chars = multiChars[in]; if (chars != 0) { retString[retStringIndex++] = chars[0]; retString[retStringIndex++] = chars[1]; return retString; } } } setState(retVal); return 0; } void HebrewMCIM::init() { memset(subst, 0, 255); subst[')'] = 1488; subst['B'] = 1489; subst['G'] = 1490; subst['D'] = 1491; subst['H'] = 1492; subst['W'] = 1493; subst['Z'] = 1494; subst['X'] = 1495; subst['+'] = 1496; 
subst['Y'] = 1497; subst['k'] = 1498; // finals subst['m'] = 1501; subst['n'] = 1503; subst['c'] = 1509; subst['P'] = 1508; subst['K'] = 1499; subst['L'] = 1500; subst['M'] = 1502; subst['N'] = 1504; subst['S'] = 1505; subst['('] = 1506; subst['p'] = 1507; subst['C'] = 1510; subst['Q'] = 1511; subst['R'] = 1512; subst['#'] = 1513; // special multiChars subst['&'] = 50; subst['$'] = 50; static int x[] = {1513, 1474}; multiChars['&'] = x; static int y[] = {1513, 1473}; multiChars['$'] = y; subst['T'] = 1514; // VOWELS subst['A'] = 1463; subst['F'] = 1464; subst['E'] = 1462; subst['"'] = 1461; subst['I'] = 1460; subst['O'] = 1465; subst['U'] = 1467; // OTHER DIACRITICS subst['.'] = 1468; subst['-'] = 1470; subst[','] = 1471; // Compound input // CANTILLATION subst[':'] = 2; subst2[2]['A'] = 1458; subst2[2]['E'] = 1457; subst2[2]['F'] = 1459; /* Telisha qetana is postpositive as in '04' above. However, Michigan # code '24' is for a medial telisha. Graphically, there is no # difference. */ subst['2'] = 5; subst2[5]['4'] = 1449; /* Note Michigan encoding distinguishes between medial metheg '35' (occuring # on the left of the vowel), and the ordinary meteg '95' (occuring on the # right of the vowel). It is also used for silluq. */ subst['3'] = 6; subst2[6]['3'] = 1433; subst2[6]['5'] = 1469; /* The Michigan code of telisha gedola in medial position. Graphically, # there is no difference. */ subst['4'] = 7; subst2[7]['4'] = 1440; subst['6'] = 8; subst2[8]['0'] = 1451; subst2[8]['1'] = 1436; subst['1'] = 4; subst2[4]['0'] = 1434; /* In the poetic books, prepositive dehi occurs; it's unclear whether # tipeha also occurs in the poetic books. Otherwise, we could simply # check for what book in the Tanach we are in. Michigan uses the same # code for each. */ subst2[4]['3'] = 1430; /* This is the poetic accent mugrash, which also includes rebia, but is # encoded separately as '81' in the Michigan text. 
*/ subst2[4]['1'] = 1437; subst2[4]['4'] = 1440; subst['0'] = 3; subst2[3]['0'] = 1475; subst2[3]['1'] = 1426; /* According to BHS, zarqa and sinnor are both postpositive. However, # the Michigan encoding uses one code for both. The Unicode zarqa # (0x0598) is definitely NOT postpositive. And further, the shape of # the symbol is different in BHS and Uniocde. This needs further # research to determine what's going on here. For now, we follow BHS # and use the postpositive Unicode zinor or both accents. */ subst2[3]['2'] = 1454; /* Pashta is postpositive, and the Unicode equivalent reflects # this. However, there is a poetic equivalent -- azla legarmeh -- # which is not postpositive, but no equivalent code point exists in # Unicode. The Michigan encoding does not distinguish between the two, # although it could be algorithmically determined. */ subst2[3]['3'] = 1433; subst2[3]['4'] = 1449; subst2[3]['5'] = 1472; /* This is the Unicode Hebrew *accent*; there is also another Hebrew # *punctuation* called GERSHAYIM 0x05F4. I'm using the more # traditional rounded marks, rather than the alternate straight # marks. */ subst2[8]['2'] = 1438; // Also known as azla subst2[8]['3'] = 1448; subst2[8]['4'] = 1452; subst2[8]['5'] = 1427; subst['8'] = 9; subst2[9]['0'] = 1428; subst2[9]['1'] = 1431; /* Note, this accent is actually sinnorit, but it does not exist as a # separate glyph in the Unicode standard. The 'ZINOR' Unicode accent # is postpositive, while sinnorit is not. ZARQA is as close as I can # get to this. */ subst2[9]['2'] = 1432; /* The Unicode form does not match the form used by BHS, but the names # are the same. 
*/ subst2[9]['3'] = 1441; subst2[9]['4'] = 1439; subst2[9]['5'] = 1429; subst['7'] = 10; subst2[10]['0'] = 1444; subst2[10]['1'] = 1445; subst2[10]['2'] = 1446; subst2[10]['3'] = 1430; // also '13', '73' also is used for majela subst2[10]['4'] = 1443; subst2[10]['5'] = 1469; // this is silluq; should appear to the left of the vowel subst['9'] = 11; subst2[11]['1'] = 1435; subst2[11]['2'] = 1425; subst2[11]['3'] = 1450; subst2[11]['4'] = 1447; subst2[11]['5'] = 1469; // should appear to the right of the vowel } /* # CANTILLION MARKS my $ETNAHTA = '֑'; # officially the Unicode name for this symbol was "SEGOL." However, that is # not a unique name, conflicting with the vowel of the same name. Further, # the position of the symbol is different. I have changed the name of the # accent to "SEGOLTA," the traditional name for this accent. my $SEGOLTA = '֒'; my $SHALSHELET = '֓'; my $ZAQEF_QATAN = '֔'; my $ZAQEF_GADOL = '֕'; my $TIPEHA = '֖'; my $REVIA = '֗'; my $ZARQA = '֘'; my $PASHTA = '֙'; my $YETIV = '֚'; my $TEVIR = '֛'; my $GERESH = '֜'; my $GERESH_MUQDAM = '֝'; my $GERSHAYIM = '֞'; my $QARNEY_PARA = '֟'; my $TELISHA_GEDOLA = '֠'; my $PAZER = '֡'; my $MUNAH = '֣'; my $MAHAPAKH = '֤'; my $MERKHA = '֥'; my $MERKHA_KEFULA = '֦'; my $DARGA = '֧'; my $QADMA = '֨'; my $TELISHA_QETANA = '֩'; my $YERAH_BEN_YOMO = '֪'; my $OLE = '֫'; my $ILUY = '֬'; my $DEHI = '֭'; my $ZINOR = '֮'; # HEBREW MARK my $MASORA_CIRCLE = '֯'; # HEBREW EXTENDED-A points and punctuation my $SHEVA = 'ְ'; my $HATAF_SEGOL = 'ֱ'; my $HATAF_PATAH = 'ֲ'; my $HATAF_QAMATS = 'ֳ'; my $HIRIQ = 'ִ'; my $TSERE = 'ֵ'; my $SEGOL = 'ֶ'; # furtive Patah is not a distinct character my $PATAH = 'ַ'; my $QAMATS = 'ָ'; my $HOLAM = 'ֹ'; my $QUBUTS = 'ֻ'; # also used as shuruq # falls within the base letter my $DAGESH_OR_MAPIQ = 'ּ'; # also used as siluq my $METAG = 'ֽ'; my $MAQAF = '־'; my $RAFE = 'ֿ'; # Also used for legarmeh # may be treated as spacing punctuation, not as a point my $PASEQ = '׀'; my $SHIN_DOT = 'ׁ'; my 
$SIN_DOT = 'ׂ'; my $SOF_PASUQ = '׃'; # HEBREW MARK my $UPPER_DOT = 'ׄ'; # HEBREW LETTERS based on ISO 8859-8 # aleph # x (alef symbol - 2135) my $ALEF = 'א'; # x (bet symbol - 2136) my $BET = 'ב'; # x (gimel symbol - 2137) my $GIMEL = 'ג'; # x (dalet symbol - 2138) my $DALET = 'ד'; my $HE = 'ה'; my $VAV = 'ו'; my $ZAYIN = 'ז'; my $HET = 'ח'; my $TET = 'ט'; my $YOD = 'י'; my $FINAL_KAF = 'ך'; my $KAF = 'כ'; my $LAMED = 'ל'; my $FINAL_MEM = 'ם'; my $MEM = 'מ'; my $FINAL_NUN = 'ן'; my $NUN = 'נ'; my $SAMEKH = 'ס'; my $AYIN = 'ע'; my $FINAL_PE = 'ף'; my $PE = 'פ'; my $FINAL_TSADI = 'ץ'; # also known as zade my $TSADI = 'צ'; my $QOF = 'ק'; my $RESH = 'ר'; my $SHIN = 'ש'; my $TAV = 'ת'; # Yiddish digraphs # Hebrew Ligature # tsvey vovn my $DOUBLE_VAV = 'װ'; my $VAV_YOD = 'ױ'; # tsvey yudn my $DOUBLE_YOD = 'ײ'; # Additional punctuation my $PUNCT_GERESH = '׳'; my $PUNCT_GERSHAYIM = '״'; # Reserved: 0x05F5" # x (hebrew point judeo-spanish varika - FB1E) #my $JUDEO_SPANISH_VARIKA = pack("U",0xFB1E); # UTF-8 OxFB1E ############################# # End of Unicode 2.0 Hebrew # ############################# # A hash whose key is a Michagan code, and whose value is a Unicode # equvalent char subst[] = new char [255]; subst[')'] = 1488; 'B' => $BET, 'G' => $GIMEL, 'D' => $DALET, 'H' => $HE, 'W' => $VAV, 'Z' => $ZAYIN, 'X' => $HET, '+' => $TET, 'Y' => $YOD, 'K' => $KAF, 'L' => $LAMED, 'M' => $MEM, 'N' => $NUN, 'S' => $SAMEKH, '(' => $AYIN, 'P' => $PE, 'C' => $TSADI, 'Q' => $QOF, 'R' => $RESH, '#' => $SHIN, # the letter shin without a point '&' => ($SHIN . $SIN_DOT), '$' => ($SHIN . $SHIN_DOT), # ' 'T' => $TAV, # VOWELS 'A' => $PATAH, 'F' => $QAMATS, 'E' => $SEGOL, '"' => $TSERE, 'I' => $HIRIQ, 'O' => $HOLAM, 'U' => $QUBUTS, ':' => $SHEVA, ':A' => $HATAF_PATAH, ':E' => $HATAF_SEGOL, ':F' => $HATAF_QAMATS, # OTHER DIACRITICS '.' 
=> $DAGESH_OR_MAPIQ, '-' => $MAQAF, ',' => $RAFE, # CANTILLATION '00' => $SOF_PASUQ, '01' => $SEGOLTA, # According to BHS, zarqa and sinnor are both postpositive. However, # the Michigan encoding uses one code for both. The Unicode zarqa # (0x0598) is definitely NOT postpositive. And further, the shape of # the symbol is different in BHS and Unicode. This needs further # research to determine what's going on here. For now, we follow BHS # and use the postpositive Unicode zinor for both accents. '02' => $ZINOR, # Pashta is postpositive, and the Unicode equivalent reflects # this. However, there is a poetic equivalent -- azla legarmeh -- # which is not postpositive, but no equivalent code point exists in # Unicode. The Michigan encoding does not distinguish between the two, # although it could be algorithmically determined. '03' => $PASHTA, '04' => $TELISHA_QETANA, '05' => $PASEQ, '10' => $YETIV, # In the poetic books, prepositive dehi occurs; it's unclear whether # tipeha also occurs in the poetic books. Otherwise, we could simply # check for what book in the Tanach we are in. Michigan uses the same # code for each. '13' => $TIPEHA, # also $DEHI # This is the poetic accent mugrash, which also includes rebia, but is # encoded separately as '81' in the Michigan text. '11' => $GERESH_MUQDAM, '14' => $TELISHA_GEDOLA, # Telisha qetana is postpositive as in '04' above. However, Michigan # code '24' is for a medial telisha. Graphically, there is no # difference. '24' => $TELISHA_QETANA, '33' => $PASHTA, # The Michigan code of telisha gedola in medial position. Graphically, # there is no difference. '44' => $TELISHA_GEDOLA, '60' => $OLE, '61' => $GERESH, # This is the Unicode Hebrew *accent*; there is also another Hebrew # *punctuation* called GERSHAYIM 0x05F4. I'm using the more # traditional rounded marks, rather than the alternate straight # marks. 
'62' => $GERSHAYIM, # Also known as azla '63' => $QADMA, '64' => $ILUY, '65' => $SHALSHELET, '80' => $ZAQEF_QATAN, '81' => $REVIA, # Note, this accent is actually sinnorit, but it does not exist as a # separate glyph in the Unicode standard. The 'ZINOR' Unicode accent # is postpositive, while sinnorit is not. ZARQA is as close as I can # get to this. '82' => $ZARQA, # The Unicode form does not match the form used by BHS, but the names # are the same. '83' => $PAZER, '84' => $QARNEY_PARA, '85' => $ZAQEF_GADOL, # Note Michigan encoding distinguishes between medial metheg '35' (occurring # on the left of the vowel), and the ordinary meteg '95' (occurring on the # right of the vowel). It is also used for silluq. '35' => $METAG, '70' => $MAHAPAKH, '71' => $MERKHA, '72' => $MERKHA_KEFULA, '73' => $TIPEHA, # also '13', '73' also is used for majela '74' => $MUNAH, '75' => $METAG, # this is silluq; should appear to the left of the vowel '91' => $TEVIR, '92' => $ETNAHTA, '93' => $YERAH_BEN_YOMO, '94' => $DARGA, '95' => $METAG, # should appear to the right of the vowel # Not used by the Michigan Encoding # $UPPER_DOT = '05C4'; ); # declare other variables my (@bhsLines, @bhsVerse, @entity_line) = (); my ($i, $verse, $word, $character) = 0; my ($element, $saveGuttural) = ""; # read in a line while (<>) { # Process one verse # iterate over every character and change to XML decimal entity CHAR: for ( $i = 0; ($i < scalar(@bhsVerse)); $i++) { # find and convert final kaf, mem, nun, pe, tsade ( # if final form $bhsVerse[$i] =~ /[KMNPC]/ ) && ( ( # whitespace or $bhsVerse[$i+1] =~ /[ \-?]/ ) || ( # EOL or $i == ( scalar(@bhsVerse) - 1 ) ) || ( # sof pasuq or ( $bhsVerse[$i+1] =~ /0/ ) && ( $bhsVerse[$i+2] =~ /0/ ) ) || ( # one accent followed by white, eol or ( ( $bhsVerse[$i+1] =~ /\d/ ) && ( $bhsVerse[$i+2] =~ /\d/ ) ) && ( ( $bhsVerse[$i+3] =~ /[ \-?]/ ) || ( $i == ( scalar(@bhsVerse) - 1 ) ) ) ) || ( # two accents followed by white, eol ( ( $bhsVerse[$i+1] =~ /\d/ ) && ( 
$bhsVerse[$i+2] =~ /\d/ ) && ( $bhsVerse[$i+3] =~ /\d/ ) && ( $bhsVerse[$i+4] =~ /\d/ ) ) && ( ( $bhsVerse[$i+5] =~ /[ \-?]/ ) || ( $i == ( scalar(@bhsVerse) - 1 ) ) ) ) || ( # followed by a vowel and white, eol, sof pasuq ( $bhsVerse[$i+1] =~ /[:F]/ ) && ( # followed by ( $bhsVerse[$i+2] =~ /[ \-?]/ ) || # whitespace or ( $i == ( scalar(@bhsVerse) - 1 ) ) || # eol or ( # sof pasuq ( $bhsVerse[$i+2] =~ /0/ ) && ( $bhsVerse[$i+3] =~ /0/ ) ) ) ) ) # end of what follows after final letter && do { $bhsVerse[$i] =~ /K/ && eval { push @entity_line,$FINAL_KAF; } && next CHAR; $bhsVerse[$i] =~ /M/ && eval { push @entity_line,$FINAL_MEM; } && next CHAR; $bhsVerse[$i] =~ /N/ && eval { push @entity_line,$FINAL_NUN; } && next CHAR; $bhsVerse[$i] =~ /P/ && eval { push @entity_line,$FINAL_PE; } && next CHAR; $bhsVerse[$i] =~ /C/ && eval { push @entity_line,$FINAL_TSADI; } && next CHAR; }; # find and convert "furtive patach" ( $bhsVerse[$i] =~ /A/ ) && # If the letter is a patach ( $bhsVerse[$i-1] =~ /[)HX(]/ ) && # and is preceded by a guttural ( ( $bhsVerse[$i-2] =~ /[AEFOU]/ ) || # and is preceded by a vowel ( ( $bhsVerse[$i-2] =~ /\./ ) && # or by suruq ( $bhsVerse[$i-3] =~ /W/ ) ) || # ( ( $bhsVerse[$i-2] =~ /W/ ) && # or by holem (written plene) ( $bhsVerse[$i-3] =~ /O/ ) ) || # ( ( $bhsVerse[$i-2] =~ /Y/ ) && # or by hiriq-yod ( $bhsVerse[$i-3] =~ /I/ ) ) ) && do { $saveGuttural = pop @entity_line; # snip off the guttural push @entity_line,$PATAH; # push on the patach push @entity_line,$saveGuttural; # push back on the guttural next CHAR; }; # convert cantillation # since we have previously dealt with all other cases of # numbers, two digit patterns are all we have to search for $bhsVerse[$i] =~ /\d/ && $bhsVerse[$i+1] =~ /\d/ && do { push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"}; $i++; # accents are two digits long, so advance past the 2nd digit next CHAR; }; # convert katef vowels, which are two characters long $bhsVerse[$i] =~ /:/ && 
$bhsVerse[$i+1] =~ /[AEF]/ && do { push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"}; $i++; next CHAR; }; # convert everything else push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]"}; } # end CHAR # print the line to standard output with XML character-level encoding # each character has the following format: # Ӓ # set up the verse element $word = 1; $character = 1; print "\n\n"; # print each character element # if there is a space, then close the word entity, open a new word # entity, increment the word number, reset the character number to # zero. foreach $element (@entity_line) { if ( $element =~ " " ) { $word++; $character = 1; print "\n\n"; next; } print "$element\n"; $character++; } # close the verse element print "\n"; # reinitialize variables @bhsVerse = (); @entity_line = (); @bhsLines = (); } # end while # close the XML document print "\n"; */