[sword-svn] r56 - trunk/modules/mt-lxx-parallel

Sun Dec 4 14:24:23 MST 2005

Author: mgruner
Date: 2005-12-04 14:24:14 -0700 (Sun, 04 Dec 2005)
New Revision: 56

Modified:
   trunk/modules/mt-lxx-parallel/convert.pl
Log:
numerous fixes. output file should be valid OSIS (I hope;)



Modified: trunk/modules/mt-lxx-parallel/convert.pl
===================================================================

--- trunk/modules/mt-lxx-parallel/convert.pl	2005-12-02 11:15:52 UTC (rev 55)
+++ trunk/modules/mt-lxx-parallel/convert.pl	2005-12-04 21:24:14 UTC (rev 56)
@@ -93,33 +93,6 @@
 "!" => "!", #occurs in the text!?
 "|" => "|", #occurs in the text!?
 
-# "*A" =>chr(0x0391), #GREEK CAPITAL LETTER ALPHA
-# "*B" =>chr(0x0392), #GREEK CAPITAL LETTER BETA
-# "*G" =>chr(0x0393), #GREEK CAPITAL LETTER GAMMA
-# "*D" =>chr(0x0394), #GREEK CAPITAL LETTER DELTA
-# "*E" =>chr(0x0395), #GREEK CAPITAL LETTER EPSILON
-# "*V" =>chr(0x03DC), #GREEK LETTER DIGAMMA
-# "*Z" =>chr(0x0396), #GREEK CAPITAL LETTER ZETA
-# "*H" =>chr(0x0397), #GREEK CAPITAL LETTER ETA
-# "*Q" =>chr(0x0398), #GREEK CAPITAL LETTER THETA
-# "*I" =>chr(0x0399), #GREEK CAPITAL LETTER IOTA
-# "*K" =>chr(0x039A), #GREEK CAPITAL LETTER KAPPA
-# "*L" =>chr(0x039B), #GREEK CAPITAL LETTER LAMDA
-# "*M" =>chr(0x039C), #GREEK CAPITAL LETTER MU
-# "*N" =>chr(0x039D), #GREEK CAPITAL LETTER NU
-# "*C" =>chr(0x039E), #GREEK CAPITAL LETTER XI
-# "*O" =>chr(0x039F), #GREEK CAPITAL LETTER OMICRON
-# "*P" =>chr(0x03A0), #GREEK CAPITAL LETTER PI
-# "*R" =>chr(0x03A1), #GREEK CAPITAL LETTER RHO
-# "*S" =>chr(0x03A3), #GREEK CAPITAL LETTER SIGMA
-# "*J" =>chr(0x03A3), #GREEK CAPITAL LETTER SIGMA #at end of Word
-# "*T" =>chr(0x03A4), #GREEK CAPITAL LETTER TAU
-# "*U" =>chr(0x03A5), #GREEK CAPITAL LETTER UPSILON
-# "*F" =>chr(0x03A6), #GREEK CAPITAL LETTER PHI
-# "*X" =>chr(0x03A7), #GREEK CAPITAL LETTER CHI
-# "*Y" =>chr(0x03A8), #GREEK CAPITAL LETTER PSI
-# "*W" =>chr(0x03A9), #GREEK CAPITAL LETTER OMEGA
-
 "A" =>chr(0x03B1), #GREEK SMALL LETTER ALPHA
 "B" =>chr(0x03B2), #GREEK SM LETT BETA / SM LETTER BETA BEGINNING OF WORD
 "G" =>chr(0x03B3), #GREEK SMALL LETTER GAMMA
@@ -251,6 +224,7 @@
 "=\%p" => "Difference in preposition or particle.",
 "=p\%" => "Difference in preposition or particle.",
 "=\%p?" => "Difference in preposition or particle?",
+"=\%?p" => "Difference in preposition or particle?",
 "=p" => "Difference in preposition or particle.", # TODO: my addition, check, uncertain?
 "={d}\%p" => "Difference in preposition or particle.", # TODO: my addition, check, uncertain? DOUBLET?
 "=\%pa" => "Difference in preposition or particle.", # TODO: my addition, check
@@ -260,6 +234,7 @@
 "=\%p+?" => "Addition of preposition or particle?",
 "=\%p-" => "Omission of preposition or particle.",
 "=\%p-?" => "Omission of preposition or particle?",
+"=\%?p-" => "Omission of preposition or particle?",
 "=p\%-" => "Omission of preposition or particle.", # TODO: my addition, check, uncertain?
 "=p-" => "Omission of preposition or particle.", # TODO: my addition, check, uncertain?
 "=;" => "Retroversion in col. b based on equivalence occurring in immediate or remote context.",
@@ -311,25 +286,17 @@
 
 );
 
-sub createNote(){
-	my $noteText = shift;
-	return("<note type=\"textual\">$noteText</note> ");
-}
-sub openNote(){
-	my $noteText = shift;
-	return("<note type=\"textual\">$noteText ");
-}
-sub closeNote(){
-	my $noteText = shift;
-	return("$noteText</note> ");
-}
+sub createNote { my $noteText = shift; return("<note type=\"textual\">$noteText</note> "); } # returns $noteText wrapped in a complete textual <note> element plus a trailing space; no "()" prototype, the sub takes one argument
+sub openNote   { my $noteText = shift; return("<note type=\"textual\">$noteText "); }        # returns the opening <note> tag followed by $noteText and a space; the element is left open
+sub closeNote  { my $noteText = shift; return("$noteText</note> "); }                        # returns $noteText followed by the closing </note> tag and a space
 
-
 sub translateHebrewNote(){
 	my $origNote = shift;
 
 #	print("TranslateHebrewNote $origNote\n");
 
+	(not $origNote) and die("Hebrew note empty.");
+
 	($origNote eq "=") and return;	#= only marks colB, no real note
 
 	($notes{ $origNote }) and return( &createNote( $notes{$origNote} ) );
@@ -403,7 +370,7 @@
  	($origNote =~ m/^.+[.].+$/) and #Occurs e.g.: "<gen1.1 ex1.2 lev3.3"
 		return $origNote;
 
-	($origNote =~ m/^[?].*/) and 
+	($origNote =~ m/^[?](.*)/) and 
 		return( &createNote( $notes{"?"} ) . &translateHebrewWordorNote( $1 ) );
 
 	($origNote =~ m/^(.+),(.+)$/) and 	# 2 Notes / Words, split up, but only at the end
@@ -438,6 +405,8 @@
 
 #	print("TranslateGreekNote $origNote\n");
 
+	(not $origNote) and die("Greek note empty.");
+
 	($notes{ $origNote }) and return( &createNote( $notes{$origNote} ) );
 
 	($origNote =~ m/^\[(.+)\]?/) and 
@@ -540,6 +509,8 @@
 sub translateHebrewWordorNote(){ #will return unicode hebrew with morph separation
 	my $hebrew = shift;
 
+	if (not $hebrew) { die("Hebrew string empty.") };
+
 # 	print("TranslateHebrew of: $hebrew\n");
 
 	$hebrew =~ s/^mn$/.mn/;	#Ezek 24:17, error?
@@ -568,6 +539,7 @@
 
 	my $greek = shift;
 
+	if (not $greek) { die("Greek string empty.") };
 #	printf("TranslateGreek of $greek\n");
 
 	( $notes {$greek} ) and return &translateGreekNote( $greek ); # exact match first
@@ -617,22 +589,24 @@
 	$origLine =~ s/=a\$\/DY/=A\$\/DY/;# TODO: UGLY HACK, Hebrew letter wrong
 	$origLine =~ s/{\.\.\^EPIQEI\\S\.\.\^E\)FI\/LHSA}/{..^EPIQEI\\S E)FI\/LHSA}/;# TODO: UGLY HACK, strange note
 	$origLine =~ s/E\t\)KPE\/SH\|/\tE)KPE\/SH|/; #occurs, tab misplaced
-
 	$origLine =~ s/^\(..r\(L\/YK}/{..r(L\/YK}/; # in EZEK
 
 	$origLine =~ s/^DANW {t}$/DANW\t{t}/; # in DAN
-
 	$origLine =~ s/AI\)W=NOS\[110\.10/AI)W=NOS [110.10/; # in PS
 	$origLine =~ s/W\/YD\(Y{\*\*}/W\/YD(Y {**}/; # in PS
-	$origLine =~ s/{\.1\.dU\(PE\\R}/{..dU(PE\R}/; # in PS
+	$origLine =~ s/{\.1\.dU\(PE\\R}/{..dU(PE\\R}/; # in PS
 
-
 	$origLine =~ m/^W\(\/SPER/ and return;	#ignore, probably an error
 
 	($origLine eq "W/)T H/GRG\$Y ^ =W/)T W/H/)MRY KAI\\ TO\\N AMORRAI=ON ") and 
 		$origLine = "W/)T H/GRG\$Y ^ =W/)T W/H/)MRY\tKAI\\ TO\\N AMORRAI=ON"; # TODO: hack, Tab missing
-	($origLine eq "W/H/KHNYM =W/H/)BNYM .m .kb # KAI\\ OI( LI/QOI ") and 
-		$origLine = "W/H/KHNYM =W/H/)BNYM .m .kb\tKAI\\ OI( LI/QOI"; # TODO: hack, Tab missing
+
+	($origLine eq "W/H/KHNYM =W/H/)BNYM .m .kb # KAI\\ OI( LI/QOI ") and # in JoshB: Tab misplaced
+		$origLine = "W/H/KHNYM =W/H/)BNYM .m .kb\tKAI\\ OI( LI/QOI"; # TODO: hack, Tab missing; "\\" keeps the literal backslash after KAI that the matched input line contains
+
+	($origLine eq "{...?AU)TOU=} MDBR =v\tLALOU=NTOS") and 
+		$origLine = "MDBR =v\tLALOU=NTOS"; # In EZEK: TODO: error, greek in first col
+
 	($origLine eq "W/YC+YRW =;W/YC+YDW .rd <9.12 E)PESITI/SANTO {d} KAI\\ H(TOIMA/SANTO ") and 
 		$origLine = "W/YC+YRW =;W/YC+YDW .rd <9.12\tE)PESITI/SANTO {d} KAI\\ H(TOIMA/SANTO"; # TODO: hack, Tab missing
 	($origLine eq "W/YBW) {...EI)S}\tKAI\\ EI)SH=LQEN") and 
@@ -640,7 +614,7 @@
 	($origLine eq "W/L) {..^OU)}\tDE\\") and 
 		$origLine = "W/L)\t{..^OU)} DE\\"; # TODO: hack, TAB misplaced
 
-# 	printf("parsing %s\n", $origLine);
+# 	print("parsing %s\n", $origLine);
 
 	($origLine =~ m/^([^=\t]+)?([=][^\t]*)?\t(.+)$/) or die("No match in parseLine().\n");
 	($1 or $2) or die("Hebrew not found.\n");
@@ -655,25 +629,17 @@
 
 	$result .= "<row>\n  <cell>";
 	foreach my $wordA (@hebrewWordsColA){
-		$result .= &translateHebrewWordorNote( $wordA ) . " ";
+		($wordA) and $result .= &translateHebrewWordorNote( $wordA ) . " ";
 	}
 	$result .= "</cell>\n  <cell>";
 
 	foreach my $wordB (@hebrewWordsColB){
-			$result .= &translateHebrewWordorNote( $wordB ) . " ";
+		($wordB) and $result .= &translateHebrewWordorNote( $wordB ) . " ";
 	}
 	$result .= "</cell>\n  <cell>";
 
 	foreach my $wordG (@greekWords){
-#		if ( $greekWords[$index] eq "{x}" ){ #special case: note containing a space, has to be handled together
-#			$result .= &translateGreekWordorNote( "$wordG $greekWords[$index+1]" );
-#			$index += 2;
-#		}
-#		elsif ( $wordG eq "{x}" ){	#skip
-#			++$index;
-#		}
-#		else{
-		$result .= &translateGreekWordorNote( $wordG ). " ";
+		($wordG) and $result .= &translateGreekWordorNote( $wordG ). " ";
 	}
 	$result .= "</cell>\n</row>";
 #  	printf("Result: %s\n", $result);
@@ -729,16 +695,26 @@
 
 	my @result;
 
+	push(@result, "<div type=\"book\" osisID=\"$osis_id\">");
+
 	CHAPTER: foreach my $chapter(1..1000){
+		my $chapter_header_written;
 		print("Processing $bookname_infile chapter $chapter.\n");
 		my $verse_found;
 		VERSE: foreach my $verse(1..1000){
 			my @verseContent = &grabVerseContent($bookname_infile, $chapter, $verse, @BUF);
 			if (@verseContent) {
 				if ($bookname_infile eq "Obad"){
+					if (not $chapter_header_written) { 
+						$chapter_header_written = 1; #no chapters in Obadiah
+					}
 					push(@result, "<verse osisID=\"$osis_id.$verse\">"); #chapter will be ignored for >1 by grabVerseContent
 				}
 				else{
+					if (not $chapter_header_written) { 
+						push(@result, "<chapter osisID=\"$osis_id.$chapter\">");
+						$chapter_header_written = 1;
+					}
 					push(@result, "<verse osisID=\"$osis_id.$chapter.$verse\">");
 				}
 				push(@result, @verseContent);
@@ -746,6 +722,9 @@
 				$verse_found = 1;
 			}
 			else{ #verse nonexistent, goto next chapter
+				if ($chapter_header_written and (not $bookname_infile eq "Obad") ) { 
+					push(@result, "</chapter>");
+				}
 				last VERSE;
 			}
 		}
@@ -754,8 +733,11 @@
 			last CHAPTER;
 		}
 	}
+
+	push(@result, "</div>"); #book
+	print("done.\n");
+
 	return(@result);
-	print("done.\n");
 }
 
 sub processBookVariant(){
@@ -776,17 +758,25 @@
 
 	my @result;
 
+	push(@result, "<div type=\"book\" osisID=\"$osis_id\">");
+
 	CHAPTER: foreach my $chapter(1..1000){
 		print("Processing $bookname_infile_A and $bookname_infile_B chapter $chapter.\n");
+		my $chapter_header_written;
 		my $verse_found;
 		VERSE: foreach my $verse(1..1000){
 			my @verseContentA = &grabVerseContent($bookname_infile_A, $chapter, $verse, @BUFA);
 			my @verseContentB = &grabVerseContent($bookname_infile_B, $chapter, $verse, @BUFB);
 			if (@verseContentA or @verseContentB) { 
+				if (not $chapter_header_written) { 
+					push(@result, "<chapter osisID=\"$osis_id.$chapter\">");
+					$chapter_header_written = 1;
+				}
 				push(@result, "<verse osisID=\"$osis_id.$chapter.$verse\">");
 				$verse_found = 1;
 			}
 			else{ #verse nonexistent, goto next chapter
+				if ($chapter_header_written) { push(@result, "</chapter>"); }
 				last VERSE;
 			}
 			if (@verseContentA){
@@ -805,9 +795,12 @@
 			last CHAPTER;
 		}
 	}
-	return(@result);
+
+	push(@result, "</div>"); #book
 	print("done.\n");
 
+	return(@result);
+
 }
 
 sub loadFile(){ #$fileName	loads the file into the buffer and makes small corrections
@@ -818,47 +811,47 @@
 
 	my @result;
 	my $index = 0;
-	foreach my $currentItem (@buffer){
-		if ($buffer[$index] =~ m/^DANIHL/){
+	LOOP: foreach my $currentItem (@buffer){
+		if ($currentItem =~ m/^DANIHL/){
 			$result[$#result] .= " " .$buffer[$index];
 		}
-		elsif ($buffer[$index] =~ m/^NUMA/){
+		elsif ($currentItem =~ m/^NUMA/){
 			$result[$#result] .= $buffer[$index];
 		}
-		elsif ($buffer[$index] =~ m/^DEUTERONO\/MION/){
+		elsif ($currentItem =~ m/^DEUTERONO\/MION/){
 			$result[$#result] .= " ".$buffer[$index];
 		}
-		elsif ($buffer[$index] =~ m/^AU\)TOU=/){
+		elsif ($currentItem =~ m/^AU\)TOU=/){
 			$result[$#result] .= " ".$buffer[$index];
 		}
-		elsif ($buffer[$index] =~ m/^E\(\/C/){
+		elsif ($currentItem =~ m/^E\(\/C/){
 			$result[$#result] .= " ". $buffer[$index];
 		}
-		elsif ($buffer[$index] =~ m/^MOU/){
+		elsif ($currentItem =~ m/^MOU/){
 			$result[$#result] .= " " . $buffer[$index];
 		}
-		elsif ($buffer[$index] =~ m/^NEHL$/){
+		elsif ($currentItem =~ m/^NEHL$/){
 			$result[$#result] .= $buffer[$index]; # no space, ANANEL
 		}
-		elsif ($buffer[$index] =~ m/^ESTHKE\/NAI$/){
+		elsif ($currentItem =~ m/^ESTHKE\/NAI$/){
 			$result[$#result] .= $buffer[$index]; # no space
 		}
-		elsif ($buffer[$index] =~ m/^ESTHKW\\S$/){
+		elsif ($currentItem =~ m/^ESTHKW\\S$/){
 			$result[$#result] .= $buffer[$index]; # no space
 		}
-		elsif ($buffer[$index] =~ m/^ISA/){	# a few lines in ISAIAH have this in different styles
+		elsif ($currentItem =~ m/^ISA/){	# a few lines in ISAIAH have this in different styles
 			$result[$#result] .= $buffer[$index]; # no space
 		}
-		elsif ($buffer[$index] =~ m/^LAMYAN/){	# in LAM
+		elsif ($currentItem =~ m/^LAMYAN/){	# in LAM
 			$result[$#result] .= $buffer[$index]; # no space
 		}
-		elsif ($buffer[$index] =~ m/^EZEKIHL/){	# in LAM
+		elsif ($currentItem =~ m/^EZEKIHL/){	# in LAM
 			$result[$#result] .= $buffer[$index]; # no space
 		}
-		elsif ($buffer[$index] =~ m/^\)$/){	# in PS
+		elsif ($currentItem =~ m/^\)$/){	# in PS
 			$result[$#result] .= $buffer[$index]; # no space
 		}
-		elsif ($buffer[$index] =~ m/^PS[Y\s]/){	# in PS; breaks at PS or PSY
+		elsif ($currentItem =~ m/^PS[Y\s]/){	# in PS; breaks at PS or PSY
 			$result[$#result] .= $buffer[$index]; # no space
 		}
 		elsif (($buffer[$index+1] =~ m/^#/) && ($buffer[$index] =~ m/^(.*)#$/)){	# in Daniel, # is used as a "continue line on next line" marker
@@ -878,14 +871,30 @@
 my @result;
 
 
+
+push(@result,"<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n");
+
+push(@result,"<osis xmlns=\"http://www.bibletechnologies.net/2003/OSIS/namespace\"  xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.bibletechnologies.net/2003/OSIS/namespace osisCore.2.1.xsd\">\n");
+
+push(@result,"<osisText osisIDWork=\"MT-LXX-Parallel\" xml:lang=\"en\">\n");
+
+push(@result,"
+<header>\
+  <work osisWork=\"MT-LXX-Parallel\">\
+    <title>The Parallel Aligned Hebrew-Aramaic and Greek texts of Jewish Scripture</title>\
+    <identifier type=\"OSIS\">MT-LXX-Parallel</identifier>\
+    <refSystem>Bible.Tanach</refSystem>\
+  </work>\
+</header>\n");
+
 	# File				File id			ThML id		OSIS id		Short Book Title
-#push(@result, &processBook("01.Genesis.par", "Gen", "Gen", "Gen", "Genesis") );
-#push(@result, &processBook("02.Exodus.par", "Exod", "Exod", "Exod", "Exodus") );
-#push(@result, &processBook("03.Lev.par", "Lev", "Lev", "Lev", "Leviticus") );
-#push(@result, &processBook("04.Num.par", "Num", "Num", "Num", "Numbers") );
-#push(@result, &processBook("05.Deut.par", "Deut", "Deut", "Deut", "Deuteronomy") );
-#push(@result, &processBookVariant("07.JoshA.par", "JoshA", "Codex Alexandrinus:", "06.JoshB.par", "JoshB", "Codex Vaticanus:", "Josh", "Josh", "Joshua") );
-#push(@result, &processBookVariant("09.JudgesA.par", "JudgA", "Codex Alexandrinus:", "08.JudgesB.par", "JudgB", "Codex Vaticanus:", "Judg", "Judg", "Judges") );
+# push(@result, &processBook("01.Genesis.par", "Gen", "Gen", "Gen", "Genesis") );
+# push(@result, &processBook("02.Exodus.par", "Exod", "Exod", "Exod", "Exodus") );
+# push(@result, &processBook("03.Lev.par", "Lev", "Lev", "Lev", "Leviticus") );
+# push(@result, &processBook("04.Num.par", "Num", "Num", "Num", "Numbers") );
+# push(@result, &processBook("05.Deut.par", "Deut", "Deut", "Deut", "Deuteronomy") );
+# push(@result, &processBookVariant("07.JoshA.par", "JoshA", "Codex Alexandrinus:", "06.JoshB.par", "JoshB", "Codex Vaticanus:", "Josh", "Josh", "Joshua") );
+# push(@result, &processBookVariant("09.JudgesA.par", "JudgA", "Codex Alexandrinus:", "08.JudgesB.par", "JudgB", "Codex Vaticanus:", "Judg", "Judg", "Judges") );
   
 # push(@result, &processBook("10.Ruth.par", "Ruth", "Ruth", "Ruth", "Ruth") );
 # push(@result, &processBook("11.1Sam.par", "1Sam/K", "iSam", "1Sam", "1 Samuel") );
@@ -899,7 +908,6 @@
 # push(@result, &processBook("18.Esther.par", "Esth", "Esth", "Esth", "Esther") );
 # push(@result, &processBook("26.Job.par", "Job", "Job", "Job", "Job") );
 #  
-#  #This might need special handling
 #push(@result, &processBook("20.Psalms.par", "Ps", "Ps", "Ps", "Psalms"));
 #  
 # push(@result, &processBook("23.Prov.par", "Prov", "Prov", "Prov", "Proverbs") );
@@ -909,9 +917,9 @@
 # push(@result, &processBook("41.Jer.par", "Jer", "Jer", "Jer", "Jeremiah") );
 # push(@result, &processBook("43.Lam.par", "Lam", "Lam", "Lam", "Lamentations") );
 # push(@result, &processBook("44.Ezekiel.par", "Ezek", "Ezek", "Ezek", "Ezekiel") );
-# 
+ 
 # push(@result, &processBookVariant("45.DanielOG.par", "Dan", "Old Greek:", "46.DanielTh.par", "DanTh", "Theodotion:", "Dan", "Dan", "Daniel"));
-# 
+ 
 # push(@result, &processBook("28.Hosea.par", "Hos", "Hos", "Hos", "Hosea") );
 # push(@result, &processBook("31.Joel.par", "Joel", "Joel", "Joel", "Joel") );
 # push(@result, &processBook("30.Amos.par", "Amos", "Amos", "Amos", "Amos") );
@@ -925,6 +933,8 @@
 # push(@result, &processBook("38.Zech.par", "Zech", "Zech", "Zech", "Zechariah") );
 # push(@result, &processBook("39.Malachi.par", "Mal", "Mal", "Mal", "Malachi") );
 
+push(@result, "</osisText>\n</osis>");
 
-print( join("\n", @result) );
+open( my $output_fh, ">", "mt-lxx-par.osis.xml" ) or die("Cannot open mt-lxx-par.osis.xml for writing: $!\n"); # three-arg open on a lexical handle; abort instead of silently printing to an unopened handle
+( print( {$output_fh} join("\n", @result) ) and close($output_fh) ) or die("Cannot write mt-lxx-par.osis.xml: $!\n"); # explicit close so buffered write errors are reported
  
\ No newline at end of file