[sword-svn] r334 - trunk/modules/misc_cleanup
refdoc at crosswire.org
refdoc at crosswire.org
Fri Jul 8 02:59:24 MST 2011
Author: refdoc
Date: 2011-07-08 02:59:24 -0700 (Fri, 08 Jul 2011)
New Revision: 334
Added:
trunk/modules/misc_cleanup/usfm_charmap.pl
Log:
added charmap for USFM files, disregarding tags
Added: trunk/modules/misc_cleanup/usfm_charmap.pl
===================================================================
--- trunk/modules/misc_cleanup/usfm_charmap.pl (rev 0)
+++ trunk/modules/misc_cleanup/usfm_charmap.pl 2011-07-08 09:59:24 UTC (rev 334)
@@ -0,0 +1,94 @@
+#!/usr/bin/perl
+
+## Licensed under the standard BSD license:
+
+# Copyright (c) 2002-2011 CrossWire Bible Society <http://www.crosswire.org/>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of the CrossWire Bible Society nor the names of
+# its contributors may be used to endorse or promote products
+# derived from this software without specific prior written
+# permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+## For general inquiries, comments, suggestions, bug reports, etc. email:
+## sword-support at crosswire.org
+
+#########################################################################
+
+use strict;
+use Unicode::UCD 'charinfo';
+binmode (STDOUT,":utf8");
+
+## Obtain arguments
+if (scalar(@ARGV) < 1) {
+ print "\nusfm_charmap.pl <osisfile> [-o outputfile]\n\n";
+ print "- prints a list of characters in text parts of an USFM file, ignoring tags etc\n";
+ print "- Arguments in braces < > are required. Arguments in brackets [ ] are optional.\n";
+ print "- If no -o option is specified the output goes to <STDOUT>.\n";
+ exit (-1);
+}
+
+my $file = @ARGV[0];
+my $outputFilename;
+my %list;
+
+if (@ARGV[1] eq "-o") {
+ $outputFilename = "@ARGV[2]";
+ open (OUTF , ">:utf8", "$outputFilename") or die "Could not open file @ARGV[2] for writing.";
+ select(OUTF);
+}
+
+## Initialise OSIS file
+open INF, "<:utf8",$file;
+my @doc = <INF>;
+close INF;
+my $doc=join("\n", at doc);
+# count out the characters in text nodes only
+
+$doc =~ s/\\id (.+)?\n//g;
+$doc =~ s/\\rem (.+)?\n//g;
+$doc =~ s/\\c [0-9]+\s*\n//g;
+$doc =~ s/\\v [0-9]+(-[0-9]+)?\s+//g;
+$doc =~ s/\\[a-z]+\s*\n//g;
+$doc =~ s/\\[a-z]+([0-9])?\ //g;
+$doc =~ s/\\[a-z]+([0-9])?\*//g;
+
+my @complete = split(//,$doc);
+
+foreach (@complete) {
+ my $char=$_;
+ $list{$char}++;
+}
+# print results
+
+foreach my $key(sort keys %list) {
+ my $c;
+ if ($key =~ /\p{Cc}/) { $c = " "}
+ else { $c = $key }
+
+ my $ci = charinfo(ord($key));
+
+ print "\t".$c."\tU+".$ci->{'code'}."\t".$list{$key}."\t".$ci->{'script'}."\t".$ci->{'name'}."\n";
+ }
Property changes on: trunk/modules/misc_cleanup/usfm_charmap.pl
___________________________________________________________________
Added: svn:executable
+ *
More information about the sword-cvs
mailing list