package org.crosswire.sword.frontend.im;
/**
* Title: Keyboard mapping for Michigan-Claremont Hebrew input
* Description:
* Copyright: Copyright (c) 2001 CrossWire Bible Society under the terms of the GNU GPL
* Company:
* @author Troy A. Griffitts
* @version 1.0
*/
import java.util.*;
/**
 * Input method that turns Michigan-Claremont transliteration keystrokes into
 * Unicode Hebrew consonants, vowel points and cantillation marks.
 *
 * <p>Keys are fed one at a time to {@link #translate(char)}. Most keys map
 * directly to one Hebrew character. Two keys ({@code '&'} and {@code '$'})
 * expand to a two-character sequence. The digits and {@code ':'} are
 * <em>prefix</em> keys: they produce no output by themselves but select a
 * second-level table (via the input-method state) that is consulted when the
 * next key arrives.
 *
 * <p>State values used by this class: {@code 0} and {@code 1} both mean
 * "no prefix pending" ({@code 1} is set after a mapped character has been
 * emitted); {@code 2}..{@code 11} identify the pending prefix and index
 * {@link #subst2}.
 */
public class HebrewMCIM extends SWInputMethod {

    /** Value in {@link #subst} marking a key that expands via {@link #multiChars}. */
    private static final int MULTI_CHAR = 50;

    /** Compound state entered by {@code ':'} (sheva / hataf vowels). */
    private static final int SHEVA_STATE = 2;

    /** HEBREW POINT SHEVA (U+05B0), produced by a {@code ':'} not followed by A, E or F. */
    private static final char SHEVA = 1456;

    /**
     * First-level table indexed by the typed key. Entry meanings:
     * {@code 0} = unmapped, {@code 2}..{@code 11} = prefix key selecting that
     * compound state, {@code 50} = multi-character key, anything above
     * {@code 100} = the Unicode code point to emit.
     */
    char subst[] = new char[255];

    /** Second-level tables, indexed by compound state; map Integer(key) to Integer(code point). */
    Hashtable subst2[] = new Hashtable[12];

    /** Maps Integer(key) to the Integer[] code point sequence for multi-character keys. */
    Hashtable multiChars = new Hashtable();

    /**
     * Creates the input method and populates its translation tables.
     *
     * @param name name passed through to the {@code SWInputMethod} constructor
     */
    public HebrewMCIM(String name) {
        super(name);
        init();
    }

    /**
     * Translates one keystroke.
     *
     * @param in the key that was typed
     * @return the text to insert, or {@code null} when {@code in} is a prefix
     *         key and output is deferred until the next keystroke
     */
    public String translate(char in) {
        int state = getState();
        if (state > 1) {
            return translateCompound(state, in);
        }
        return translateSingle(in);
    }

    /** Resolves the second key of a compound sequence for the given pending state. */
    private String translateCompound(int state, char in) {
        // Every outcome of a compound lookup ends the pending prefix.
        setState(0);

        if (state >= subst2.length) { // serious issue with internal structure
            return String.valueOf(in);
        }

        // NOTE(review): new Integer(...) is kept because the file uses no
        // Java 5 features; switch to Integer.valueOf if the build allows it.
        Integer hit = (Integer) subst2[state].get(new Integer(in));
        if (hit != null) {
            return String.valueOf((char) hit.intValue());
        }

        if (state == SHEVA_STATE) {
            // ':' followed by anything other than A/E/F is a plain sheva.
            // Emit it and then handle the current key as a fresh keystroke so
            // it is translated (or starts its own prefix) instead of being
            // passed through raw with the sheva lost.
            String rest = translateSingle(in);
            if (rest == null) {
                return String.valueOf(SHEVA);
            }
            return String.valueOf(SHEVA) + rest;
        }

        // No standalone meaning for the other prefixes: pass the key through.
        return String.valueOf(in);
    }

    /** Translates a key when no prefix is pending, updating the state accordingly. */
    private String translateSingle(char in) {
        char mapped = (in < subst.length) ? subst[in] : 0;

        if (mapped == 0) { // unmapped (or outside the table): pass through
            setState(0);
            return String.valueOf(in);
        }

        if (mapped > 100) { // direct code point
            setState(1);
            return String.valueOf(mapped);
        }

        if (mapped == MULTI_CHAR) {
            Integer[] chars = (Integer[]) multiChars.get(new Integer(in));
            if (chars == null) {
                // Table inconsistency: treat the key as unmapped rather than
                // parking the state machine in a non-existent state.
                setState(0);
                return String.valueOf(in);
            }
            setState(1);
            StringBuffer expanded = new StringBuffer();
            for (int i = 0; i < chars.length; i++) {
                expanded.append((char) chars[i].intValue());
            }
            return expanded.toString();
        }

        // Prefix key: remember which compound table applies to the next key.
        setState(mapped);
        return null;
    }

    /** Registers {@code second} typed in compound state {@code state} as {@code codePoint}. */
    private void compound(int state, char second, int codePoint) {
        subst2[state].put(new Integer(second), new Integer(codePoint));
    }

    /** Fills the translation tables; called once from the constructor. */
    private void init() {
        // CONSONANTS
        subst[')'] = 1488; // alef
        subst['B'] = 1489; // bet
        subst['G'] = 1490; // gimel
        subst['D'] = 1491; // dalet
        subst['H'] = 1492; // he
        subst['W'] = 1493; // vav
        subst['Z'] = 1494; // zayin
        subst['X'] = 1495; // het
        subst['+'] = 1496; // tet
        subst['Y'] = 1497; // yod
        subst['K'] = 1499; // kaf
        subst['L'] = 1500; // lamed
        subst['M'] = 1502; // mem
        subst['N'] = 1504; // nun
        subst['S'] = 1505; // samekh
        subst['('] = 1506; // ayin
        subst['P'] = 1508; // pe
        subst['C'] = 1510; // tsadi
        subst['Q'] = 1511; // qof
        subst['R'] = 1512; // resh
        subst['#'] = 1513; // shin without a dot
        subst['T'] = 1514; // tav

        // finals (typed as the lower-case form of the base letter's key)
        subst['k'] = 1498; // final kaf
        subst['m'] = 1501; // final mem
        subst['n'] = 1503; // final nun
        subst['p'] = 1507; // final pe
        subst['c'] = 1509; // final tsadi

        // special multiChars: shin followed by its distinguishing dot
        subst['&'] = MULTI_CHAR;
        subst['$'] = MULTI_CHAR;
        multiChars.put(new Integer('&'), new Integer[] {new Integer(1513), new Integer(1474)}); // shin + sin dot
        multiChars.put(new Integer('$'), new Integer[] {new Integer(1513), new Integer(1473)}); // shin + shin dot

        // VOWELS
        subst['A'] = 1463; // patah
        subst['F'] = 1464; // qamats
        subst['E'] = 1462; // segol
        subst['"'] = 1461; // tsere
        subst['I'] = 1460; // hiriq
        subst['O'] = 1465; // holam
        subst['U'] = 1467; // qubuts

        // OTHER DIACRITICS
        subst['.'] = 1468; // dagesh or mapiq
        subst['-'] = 1470; // maqaf
        subst[','] = 1471; // rafe

        // Compound input: one second-level table per prefix state
        for (int state = 2; state < subst2.length; state++) {
            subst2[state] = new Hashtable();
        }

        // ':' alone is sheva (handled in translateCompound); with A/E/F it
        // forms the hataf vowels.
        subst[':'] = SHEVA_STATE;
        compound(SHEVA_STATE, 'A', 1458); // hataf patah
        compound(SHEVA_STATE, 'E', 1457); // hataf segol
        compound(SHEVA_STATE, 'F', 1459); // hataf qamats

        // CANTILLATION

        // '0x'
        subst['0'] = 3;
        compound(3, '0', 1475); // sof pasuq
        compound(3, '1', 1426); // segolta
        /* According to BHS, zarqa and sinnor are both postpositive. However,
         * the Michigan encoding uses one code for both. The Unicode zarqa
         * (0x0598) is definitely NOT postpositive. And further, the shape of
         * the symbol is different in BHS and Unicode. This needs further
         * research to determine what's going on here. For now, we follow BHS
         * and use the postpositive Unicode zinor for both accents.
         */
        compound(3, '2', 1454); // zinor
        /* Pashta is postpositive, and the Unicode equivalent reflects
         * this. However, there is a poetic equivalent -- azla legarmeh --
         * which is not postpositive, but no equivalent code point exists in
         * Unicode. The Michigan encoding does not distinguish between the two,
         * although it could be algorithmically determined.
         */
        compound(3, '3', 1433); // pashta
        compound(3, '4', 1449); // telisha qetana
        compound(3, '5', 1472); // paseq

        // '1x'
        subst['1'] = 4;
        compound(4, '0', 1434); // yetiv
        /* In the poetic books, prepositive dehi occurs; it's unclear whether
         * tipeha also occurs in the poetic books. Otherwise, we could simply
         * check for what book in the Tanach we are in. Michigan uses the same
         * code for each.
         */
        compound(4, '3', 1430); // tipeha (also dehi)
        /* This is the poetic accent mugrash, which also includes rebia, but is
         * encoded separately as '81' in the Michigan text.
         */
        compound(4, '1', 1437); // geresh muqdam
        compound(4, '4', 1440); // telisha gedola

        // '2x'
        /* Telisha qetana is postpositive as in '04' above. However, Michigan
         * code '24' is for a medial telisha. Graphically, there is no
         * difference.
         */
        subst['2'] = 5;
        compound(5, '4', 1449); // telisha qetana (medial)

        // '3x'
        /* Note Michigan encoding distinguishes between medial metheg '35'
         * (occurring on the left of the vowel), and the ordinary meteg '95'
         * (occurring on the right of the vowel). It is also used for silluq.
         */
        subst['3'] = 6;
        compound(6, '3', 1433); // pashta
        compound(6, '5', 1469); // meteg (medial)

        // '4x'
        /* The Michigan code of telisha gedola in medial position. Graphically,
         * there is no difference.
         */
        subst['4'] = 7;
        compound(7, '4', 1440); // telisha gedola (medial)

        // '6x'
        subst['6'] = 8;
        compound(8, '0', 1451); // ole
        compound(8, '1', 1436); // geresh
        /* This is the Unicode Hebrew *accent*; there is also another Hebrew
         * *punctuation* called GERSHAYIM 0x05F4. I'm using the more
         * traditional rounded marks, rather than the alternate straight
         * marks.
         */
        compound(8, '2', 1438); // gershayim
        compound(8, '3', 1448); // qadma, also known as azla
        compound(8, '4', 1452); // iluy
        compound(8, '5', 1427); // shalshelet

        // '7x'
        subst['7'] = 10;
        compound(10, '0', 1444); // mahapakh
        compound(10, '1', 1445); // merkha
        compound(10, '2', 1446); // merkha kefula
        compound(10, '3', 1430); // tipeha; also '13', '73' also is used for majela
        compound(10, '4', 1443); // munah
        compound(10, '5', 1469); // this is silluq; should appear to the left of the vowel

        // '8x'
        subst['8'] = 9;
        compound(9, '0', 1428); // zaqef qatan
        compound(9, '1', 1431); // revia
        /* Note, this accent is actually sinnorit, but it does not exist as a
         * separate glyph in the Unicode standard. The 'ZINOR' Unicode accent
         * is postpositive, while sinnorit is not. ZARQA is as close as I can
         * get to this.
         */
        compound(9, '2', 1432); // zarqa
        /* The Unicode form does not match the form used by BHS, but the names
         * are the same.
         */
        compound(9, '3', 1441); // pazer
        compound(9, '4', 1439); // qarney para
        compound(9, '5', 1429); // zaqef gadol

        // '9x'
        subst['9'] = 11;
        compound(11, '1', 1435); // tevir
        compound(11, '2', 1425); // etnahta
        compound(11, '3', 1450); // yerah ben yomo
        compound(11, '4', 1447); // darga
        compound(11, '5', 1469); // meteg; should appear to the right of the vowel
    }
}
/*
# CANTILLATION MARKS
my $ETNAHTA = '֑';
# officially the Unicode name for this symbol was "SEGOL." However, that is
# not a unique name, conflicting with the vowel of the same name. Further,
# the position of the symbol is different. I have changed the name of the
# accent to "SEGOLTA," the traditional name for this accent.
my $SEGOLTA = '֒';
my $SHALSHELET = '֓';
my $ZAQEF_QATAN = '֔';
my $ZAQEF_GADOL = '֕';
my $TIPEHA = '֖';
my $REVIA = '֗';
my $ZARQA = '֘';
my $PASHTA = '֙';
my $YETIV = '֚';
my $TEVIR = '֛';
my $GERESH = '֜';
my $GERESH_MUQDAM = '֝';
my $GERSHAYIM = '֞';
my $QARNEY_PARA = '֟';
my $TELISHA_GEDOLA = '֠';
my $PAZER = '֡';
my $MUNAH = '֣';
my $MAHAPAKH = '֤';
my $MERKHA = '֥';
my $MERKHA_KEFULA = '֦';
my $DARGA = '֧';
my $QADMA = '֨';
my $TELISHA_QETANA = '֩';
my $YERAH_BEN_YOMO = '֪';
my $OLE = '֫';
my $ILUY = '֬';
my $DEHI = '֭';
my $ZINOR = '֮';
# HEBREW MARK
my $MASORA_CIRCLE = '֯';
# HEBREW EXTENDED-A points and punctuation
my $SHEVA = 'ְ';
my $HATAF_SEGOL = 'ֱ';
my $HATAF_PATAH = 'ֲ';
my $HATAF_QAMATS = 'ֳ';
my $HIRIQ = 'ִ';
my $TSERE = 'ֵ';
my $SEGOL = 'ֶ';
# furtive Patah is not a distinct character
my $PATAH = 'ַ';
my $QAMATS = 'ָ';
my $HOLAM = 'ֹ';
my $QUBUTS = 'ֻ';
# also used as shuruq
# falls within the base letter
my $DAGESH_OR_MAPIQ = 'ּ';
# also used as siluq
my $METAG = 'ֽ';
my $MAQAF = '־';
my $RAFE = 'ֿ';
# Also used for legarmeh
# may be treated as spacing punctuation, not as a point
my $PASEQ = '׀';
my $SHIN_DOT = 'ׁ';
my $SIN_DOT = 'ׂ';
my $SOF_PASUQ = '׃';
# HEBREW MARK
my $UPPER_DOT = 'ׄ';
# HEBREW LETTERS based on ISO 8859-8
# aleph
# x (alef symbol - 2135)
my $ALEF = 'א';
# x (bet symbol - 2136)
my $BET = 'ב';
# x (gimel symbol - 2137)
my $GIMEL = 'ג';
# x (dalet symbol - 2138)
my $DALET = 'ד';
my $HE = 'ה';
my $VAV = 'ו';
my $ZAYIN = 'ז';
my $HET = 'ח';
my $TET = 'ט';
my $YOD = 'י';
my $FINAL_KAF = 'ך';
my $KAF = 'כ';
my $LAMED = 'ל';
my $FINAL_MEM = 'ם';
my $MEM = 'מ';
my $FINAL_NUN = 'ן';
my $NUN = 'נ';
my $SAMEKH = 'ס';
my $AYIN = 'ע';
my $FINAL_PE = 'ף';
my $PE = 'פ';
my $FINAL_TSADI = 'ץ';
# also known as zade
my $TSADI = 'צ';
my $QOF = 'ק';
my $RESH = 'ר';
my $SHIN = 'ש';
my $TAV = 'ת';
# Yiddish digraphs
# Hebrew Ligature
# tsvey vovn
my $DOUBLE_VAV = 'װ';
my $VAV_YOD = 'ױ';
# tsvey yudn
my $DOUBLE_YOD = 'ײ';
# Additional punctuation
my $PUNCT_GERESH = '׳';
my $PUNCT_GERSHAYIM = '״';
# Reserved: 0x05F5"
# x (hebrew point judeo-spanish varika - FB1E)
#my $JUDEO_SPANISH_VARIKA = pack("U",0xFB1E); # UTF-8 OxFB1E
#############################
# End of Unicode 2.0 Hebrew #
#############################
# A hash whose key is a Michigan code, and whose value is a Unicode
# equivalent
my %Michigan2XMLentity = (
')' => $ALEF,
'B' => $BET,
'G' => $GIMEL,
'D' => $DALET,
'H' => $HE,
'W' => $VAV,
'Z' => $ZAYIN,
'X' => $HET,
'+' => $TET,
'Y' => $YOD,
'K' => $KAF,
'L' => $LAMED,
'M' => $MEM,
'N' => $NUN,
'S' => $SAMEKH,
'(' => $AYIN,
'P' => $PE,
'C' => $TSADI,
'Q' => $QOF,
'R' => $RESH,
'#' => $SHIN, # the letter shin without a point
'&' => ($SHIN . $SIN_DOT),
'$' => ($SHIN . $SHIN_DOT), # '
'T' => $TAV,
# VOWELS
'A' => $PATAH,
'F' => $QAMATS,
'E' => $SEGOL,
'"' => $TSERE,
'I' => $HIRIQ,
'O' => $HOLAM,
'U' => $QUBUTS,
':' => $SHEVA,
':A' => $HATAF_PATAH,
':E' => $HATAF_SEGOL,
':F' => $HATAF_QAMATS,
# OTHER DIACRITICS
'.' => $DAGESH_OR_MAPIQ,
'-' => $MAQAF,
',' => $RAFE,
# CANTILLATION
'00' => $SOF_PASUQ,
'01' => $SEGOLTA,
# According to BHS, zarqa and sinnor are both postpositive. However,
# the Michigan encoding uses one code for both. The Unicode zarqa
# (0x0598) is definitely NOT postpositive. And further, the shape of
# the symbol is different in BHS and Unicode. This needs further
# research to determine what's going on here. For now, we follow BHS
# and use the postpositive Unicode zinor for both accents.
'02' => $ZINOR,
# Pashta is postpositive, and the Unicode equivalent reflects
# this. However, there is a poetic equivalent -- azla legarmeh --
# which is not postpositive, but no equivalent code point exists in
# Unicode. The Michigan encoding does not distinguish between the two,
# although it could be algorithmically determined.
'03' => $PASHTA,
'04' => $TELISHA_QETANA,
'05' => $PASEQ,
'10' => $YETIV,
# In the poetic books, prepositive dehi occurs; it's unclear whether
# tipeha also occurs in the poetic books. Otherwise, we could simply
# check for what book in the Tanach we are in. Michigan uses the same
# code for each.
'13' => $TIPEHA, # also $DEHI
# This is the poetic accent mugrash, which also includes rebia, but is
# encoded separately as '81' in the Michigan text.
'11' => $GERESH_MUQDAM,
'14' => $TELISHA_GEDOLA,
# Telisha qetana is postpositive as in '04' above. However, Michigan
# code '24' is for a medial telisha. Graphically, there is no
# difference.
'24' => $TELISHA_QETANA,
'33' => $PASHTA,
# The Michigan code of telisha gedola in medial position. Graphically,
# there is no difference.
'44' => $TELISHA_GEDOLA,
'60' => $OLE,
'61' => $GERESH,
# This is the Unicode Hebrew *accent*; there is also another Hebrew
# *punctuation* called GERSHAYIM 0x05F4. I'm using the more
# traditional rounded marks, rather than the alternate straight
# marks.
'62' => $GERSHAYIM,
# Also known as azla
'63' => $QADMA,
'64' => $ILUY,
'65' => $SHALSHELET,
'80' => $ZAQEF_QATAN,
'81' => $REVIA,
# Note, this accent is actually sinnorit, but it does not exist as a
# separate glyph in the Unicode standard. The 'ZINOR' Unicode accent
# is postpositive, while sinnorit is not. ZARQA is as close as I can
# get to this.
'82' => $ZARQA,
# The Unicode form does not match the form used by BHS, but the names
# are the same.
'83' => $PAZER,
'84' => $QARNEY_PARA,
'85' => $ZAQEF_GADOL,
# Note Michigan encoding distinguishes between medial metheg '35' (occurring
# on the left of the vowel), and the ordinary meteg '95' (occurring on the
# right of the vowel). It is also used for silluq.
'35' => $METAG,
'70' => $MAHAPAKH,
'71' => $MERKHA,
'72' => $MERKHA_KEFULA,
'73' => $TIPEHA, # also '13', '73' also is used for majela
'74' => $MUNAH,
'75' => $METAG, # this is silluq; should appear to the left of the vowel
'91' => $TEVIR,
'92' => $ETNAHTA,
'93' => $YERAH_BEN_YOMO,
'94' => $DARGA,
'95' => $METAG, # should appear to the right of the vowel
# Not used by the Michigan Encoding
# $UPPER_DOT = '05C4';
);
# declare other variables
my (@bhsLines,
@bhsVerse,
@entity_line) = ();
my ($i,
$verse,
$word,
$character) = 0;
my ($element,
$saveGuttural) = "";
# read in a line
while (<>) {
# Process one verse
# iterate over every character and change to XML decimal entity
CHAR: for ( $i = 0; ($i < scalar(@bhsVerse)); $i++) {
# find and convert final kaf, mem, nun, pe, tsade
( # if final form
$bhsVerse[$i] =~ /[KMNPC]/
)
&&
(
( # whitespace or
$bhsVerse[$i+1] =~ /[ \-?]/
)
||
( # EOL or
$i == ( scalar(@bhsVerse) - 1 )
)
||
( # sof pasuq or
( $bhsVerse[$i+1] =~ /0/ ) &&
( $bhsVerse[$i+2] =~ /0/ )
)
||
( # one accent followed by white, eol or
(
( $bhsVerse[$i+1] =~ /\d/ ) &&
( $bhsVerse[$i+2] =~ /\d/ )
) &&
(
( $bhsVerse[$i+3] =~ /[ \-?]/ ) ||
( $i == ( scalar(@bhsVerse) - 1 ) )
)
)
||
( # two accents followed by white, eol
(
( $bhsVerse[$i+1] =~ /\d/ ) &&
( $bhsVerse[$i+2] =~ /\d/ ) &&
( $bhsVerse[$i+3] =~ /\d/ ) &&
( $bhsVerse[$i+4] =~ /\d/ )
) &&
(
( $bhsVerse[$i+5] =~ /[ \-?]/ ) ||
( $i == ( scalar(@bhsVerse) - 1 ) )
)
)
||
( # followed by a vowel and white, eol, sof pasuq
( $bhsVerse[$i+1] =~ /[:F]/ ) &&
( # followed by
( $bhsVerse[$i+2] =~ /[ \-?]/ ) || # whitespace or
( $i == ( scalar(@bhsVerse) - 1 ) ) || # eol or
( # sof pasuq
( $bhsVerse[$i+2] =~ /0/ ) &&
( $bhsVerse[$i+3] =~ /0/ )
)
)
)
) # end of what follows after final letter
&&
do {
$bhsVerse[$i] =~ /K/ && eval { push @entity_line,$FINAL_KAF; }
&& next CHAR;
$bhsVerse[$i] =~ /M/ && eval { push @entity_line,$FINAL_MEM; }
&& next CHAR;
$bhsVerse[$i] =~ /N/ && eval { push @entity_line,$FINAL_NUN; }
&& next CHAR;
$bhsVerse[$i] =~ /P/ && eval { push @entity_line,$FINAL_PE; }
&& next CHAR;
$bhsVerse[$i] =~ /C/ && eval { push @entity_line,$FINAL_TSADI; }
&& next CHAR;
};
# find and convert "furtive patach"
( $bhsVerse[$i] =~ /A/ ) && # If the letter is a patach
( $bhsVerse[$i-1] =~ /[)HX(]/ ) && # and is preceded by a guttural
( ( $bhsVerse[$i-2] =~ /[AEFOU]/ ) || # and is preceded by a vowel
( ( $bhsVerse[$i-2] =~ /\./ ) && # or by suruq
( $bhsVerse[$i-3] =~ /W/ ) ) || #
( ( $bhsVerse[$i-2] =~ /W/ ) && # or by holem (written plene)
( $bhsVerse[$i-3] =~ /O/ ) ) || #
( ( $bhsVerse[$i-2] =~ /Y/ ) && # or by hiriq-yod
( $bhsVerse[$i-3] =~ /I/ ) ) ) &&
do {
$saveGuttural = pop @entity_line; # snip off the guttural
push @entity_line,$PATAH; # push on the patach
push @entity_line,$saveGuttural; # push back on the guttural
next CHAR;
};
# convert cantillation
# since we have previously dealt with all other cases of
# numbers, two digit patterns are all we have to search for
$bhsVerse[$i] =~ /\d/ && $bhsVerse[$i+1] =~ /\d/ && do {
push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};
$i++; # accents are two digits long, so advance past the 2nd digit
next CHAR;
};
# convert katef vowels, which are two characters long
$bhsVerse[$i] =~ /:/ && $bhsVerse[$i+1] =~ /[AEF]/ && do {
push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};
$i++;
next CHAR;
};
# convert everything else
push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]"};
} # end CHAR
# print the line to standard output with XML character-level encoding
# each character has the following format:
# Ӓ
# set up the verse element
$word = 1;
$character = 1;
print "\n\n";
# print each character element
# if there is a space, then close the word entity, open a new word
# entity, increment the word number, reset the character number to
# zero.
foreach $element (@entity_line) {
if ( $element =~ " " ) {
$word++;
$character = 1;
print "\n\n";
next;
}
print "$element\n";
$character++;
}
# close the verse element
print "\n";
# reinitialize variables
@bhsVerse = ();
@entity_line = ();
@bhsLines = ();
} # end while
# close the XML document
print "