[sword-svn] r503 - trunk/modules/conf

refdoc at crosswire.org refdoc at crosswire.org
Thu Sep 3 18:46:46 MST 2015


Author: refdoc
Date: 2015-09-03 18:46:46 -0700 (Thu, 03 Sep 2015)
New Revision: 503

Modified:
   trunk/modules/conf/confmaker.pl
Log:
Added handling of UTF-8 strip filters (Hebrew vowel points, Hebrew cantillation, Arabic vowel points, Greek accents).



Modified: trunk/modules/conf/confmaker.pl
===================================================================
--- trunk/modules/conf/confmaker.pl	2015-08-28 00:00:54 UTC (rev 502)
+++ trunk/modules/conf/confmaker.pl	2015-09-04 01:46:46 UTC (rev 503)
@@ -41,9 +41,12 @@
 use XML::LibXML;
 use I18N::LangTags::List;
 use Unicode::UCD 'charinfo';
-use open ':std', ':encoding(UTF-8)';
+#use open ':std', ':encoding(UTF-8)';
+#use open qw/:std :utf8/;
+use utf8;
+use Sword;
+use HTML::Strip;
 
-
 ## Obtain arguments
 if (scalar(@ARGV) < 1) {
     print "\nconfmaker.pl -- - provides a initial conf file for a new module by analysing  given OSIS xml file.\n";
@@ -90,8 +93,19 @@
 my $parser = XML::LibXML->new();
 my $doc = $parser->parse_file($file);
 
+my $manager = new Sword::SWMgr();
 
+$manager->setGlobalOption("Hebrew Vowel Points", "Off");
+$manager->setGlobalOption("Hebrew Cantillation", "Off");
+$manager->setGlobalOption("Arabic Vowel Points", "Off");
+$manager->setGlobalOption("Greek Accents", "Off");
 
+my $hs = HTML::Strip->new();
+my $doc_text = new Sword::SWBuf($hs->parse($doc->toString()));
+#my $clean_doc_text = $hs->parse( $doc->toString());
+#$doc_text->append($clean_doc_text);
+
+
 ## obtain name, type and language
 
 my @elements = $doc->getElementsByTagName('osisText');
@@ -100,7 +114,9 @@
 my $doc_type = @elements[0]->getAttribute('osisRefWork');
 my $doc_lang = @elements[0]->getAttribute('xml:lang');
 my $doc_lang_name=I18N::LangTags::List::name($doc_lang);
+;
 
+ 
 
 if ((length($language)==0) && (length($doc_lang)==0)) {
    print STDERR $language."\n", $doc_lang."\n", $doc_lang_name."\n";
@@ -123,8 +139,8 @@
 
 my @doc_features = ('title', 'note', 'reference', 'q', 'figure', 'rdg');
 my @word_features = ('lemma', 'gloss', 'morph',);
+my @char_features = ('Hebrew Vowel Points', 'Arabic Vowel Points', 'Hebrew Cantillation', 'Greek Accents');
 
-
 my %doc_filters = ( 'title' => "OSISHeadings",
              'note'  => "OSISFootnotes",
              'reference' => "OSISScripref",
@@ -134,12 +150,20 @@
              'q'  => "OSISRedLetterWords",
              'rdg' => 'OSISVariants',
             );
+
             
 my %doc_feature = ( 'lemma' => 'StrongsNumbers',
                     'figure' => 'Images',
                      'p'  => 'NoParagraphs',
 
                   );
+
+my %diacritics = ( 'Hebrew Vowel Points' => "UTF8HebrewPoints",
+                   'Arabic Vowel Points' => 'UTF8ArabicPoints',
+                   'Hebrew Cantillation' => 'UTF8Cantillation',
+                   'Greek Accents' 	 => 'UTF8GreekAccents',
+                 );
+
             
 my %doc_has_feature;
 
@@ -165,6 +189,8 @@
 
 my @paragraphs = $doc->getElementsByTagName('p');
 if (@paragraphs==0) {$doc_has_feature{'p'}=true};
+
+
    
 # Assemble and print out
 
@@ -189,6 +215,21 @@
       print  "GlobalOptionFilter=".$doc_filters{$_}."\n"
       }
    }   
+
+# Emit a GlobalOptionFilter line for every diacritic filter that changes the text.
+foreach my $filter (@char_features) {
+   # Build a fresh tag-stripped copy of the document for each filter.
+   my $tmp = new Sword::SWBuf($hs->parse($doc->toString()));
+   # filterText is expected to modify $tmp in place -- TODO confirm against SWMgr.
+   $manager->filterText($filter, $tmp);
+   # Any difference from the unfiltered $doc_text means the filter removed something.
+   if ($tmp->c_str() ne $doc_text->c_str()) {
+      print "GlobalOptionFilter=".$diacritics{$filter}."\n";
+   }
+}
+
+
+      
 foreach (@doc_features) {
    if ($doc_has_feature{$_} && exists $doc_feature{$_}) { 
       print  "Feature=".$doc_feature{$_}."\n"
@@ -206,7 +247,7 @@
 print  "Encoding=UTF-8\n";
 print  "SourceType=OSIS\n";
 print  "LCSH=".$doc_type.".".I18N::LangTags::List::name($doc_lang)."\n";
-print  "SwordVersionDate=".`date +"%F"`."\n";
+print  "SwordVersionDate=".`date +"%F"`;
 
 if (@inputFile>0) {
    foreach(@inputFile) {




More information about the sword-cvs mailing list