[sword-svn] r503 - trunk/modules/conf
refdoc at crosswire.org
refdoc at crosswire.org
Thu Sep 3 18:46:46 MST 2015
Author: refdoc
Date: 2015-09-03 18:46:46 -0700 (Thu, 03 Sep 2015)
New Revision: 503
Modified:
trunk/modules/conf/confmaker.pl
Log:
Added handling of the UTF-8 strip filters (UTF8HebrewPoints, UTF8ArabicPoints, UTF8Cantillation, UTF8GreekAccents).
Modified: trunk/modules/conf/confmaker.pl
===================================================================
--- trunk/modules/conf/confmaker.pl 2015-08-28 00:00:54 UTC (rev 502)
+++ trunk/modules/conf/confmaker.pl 2015-09-04 01:46:46 UTC (rev 503)
@@ -41,9 +41,12 @@
use XML::LibXML;
use I18N::LangTags::List;
use Unicode::UCD 'charinfo';
-use open ':std', ':encoding(UTF-8)';
+#use open ':std', ':encoding(UTF-8)';
+#use open qw/:std :utf8/;
+use utf8;
+use Sword;
+use HTML::Strip;
-
## Obtain arguments
if (scalar(@ARGV) < 1) {
print "\nconfmaker.pl -- - provides a initial conf file for a new module by analysing given OSIS xml file.\n";
@@ -90,8 +93,19 @@
my $parser = XML::LibXML->new();
my $doc = $parser->parse_file($file);
+my $manager = new Sword::SWMgr();
+$manager->setGlobalOption("Hebrew Vowel Points", "Off");
+$manager->setGlobalOption("Hebrew Cantillation", "Off");
+$manager->setGlobalOption("Arabic Vowel Points", "Off");
+$manager->setGlobalOption("Greek Accents", "Off");
+my $hs = HTML::Strip->new();
+my $doc_text = new Sword::SWBuf($hs->parse($doc->toString()));
+#my $clean_doc_text = $hs->parse( $doc->toString());
+#$doc_text->append($clean_doc_text);
+
+
## obtain name, type and language
my @elements = $doc->getElementsByTagName('osisText');
@@ -100,7 +114,9 @@
my $doc_type = @elements[0]->getAttribute('osisRefWork');
my $doc_lang = @elements[0]->getAttribute('xml:lang');
my $doc_lang_name=I18N::LangTags::List::name($doc_lang);
+;
+
if ((length($language)==0) && (length($doc_lang)==0)) {
print STDERR $language."\n", $doc_lang."\n", $doc_lang_name."\n";
@@ -123,8 +139,8 @@
my @doc_features = ('title', 'note', 'reference', 'q', 'figure', 'rdg');
my @word_features = ('lemma', 'gloss', 'morph',);
+my @char_features = ('Hebrew Vowel Points', 'Arabic Vowel Points', 'Hebrew Cantillation', 'Greek Accents');
-
my %doc_filters = ( 'title' => "OSISHeadings",
'note' => "OSISFootnotes",
'reference' => "OSISScripref",
@@ -134,12 +150,20 @@
'q' => "OSISRedLetterWords",
'rdg' => 'OSISVariants',
);
+
my %doc_feature = ( 'lemma' => 'StrongsNumbers',
'figure' => 'Images',
'p' => 'NoParagraphs',
);
+
+my %diacritics = ( 'Hebrew Vowel Points' => "UTF8HebrewPoints",
+ 'Arabic Vowel Points' => 'UTF8ArabicPoints',
+ 'Hebrew Cantillation' => 'UTF8Cantillation',
+ 'Greek Accents' => 'UTF8GreekAccents',
+ );
+
my %doc_has_feature;
@@ -165,6 +189,8 @@
my @paragraphs = $doc->getElementsByTagName('p');
if (@paragraphs==0) {$doc_has_feature{'p'}=true};
+
+
# Assemble and print out
@@ -189,6 +215,21 @@
print "GlobalOptionFilter=".$doc_filters{$_}."\n"
}
}
+
+foreach $filter(@char_features) {
+
+ my $tmp = new Sword::SWBuf($hs->parse($doc->toString()));
+
+ $manager->filterText($filter, $tmp);
+
+ if ($tmp->c_str() ne $doc_text->c_str()) {
+ print "GlobalOptionFilter=".%diacritics{$filter}."\n";
+
+ }
+}
+
+
+
foreach (@doc_features) {
if ($doc_has_feature{$_} && exists $doc_feature{$_}) {
print "Feature=".$doc_feature{$_}."\n"
@@ -206,7 +247,7 @@
print "Encoding=UTF-8\n";
print "SourceType=OSIS\n";
print "LCSH=".$doc_type.".".I18N::LangTags::List::name($doc_lang)."\n";
-print "SwordVersionDate=".`date +"%F"`."\n";
+print "SwordVersionDate=".`date +"%F"`;
if (@inputFile>0) {
foreach(@inputFile) {
More information about the sword-cvs mailing list