[sword-svn] r2862 - in trunk: include src/modules/filters

Mon Jul 8 02:08:04 MST 2013

Author: chrislit
Date: 2013-07-08 02:08:04 -0700 (Mon, 08 Jul 2013)
New Revision: 2862

Modified:
   trunk/include/utf8arabicpoints.h
   trunk/src/modules/filters/utf8arabicpoints.cpp
Log:
applied submitted UTF8ArabicPoints code (needs testing because I replaced deprecated bcopy with memmove)

Modified: trunk/include/utf8arabicpoints.h
===================================================================

--- trunk/include/utf8arabicpoints.h	2013-07-08 09:01:53 UTC (rev 2861)
+++ trunk/include/utf8arabicpoints.h	2013-07-08 09:08:04 UTC (rev 2862)
@@ -35,6 +35,8 @@
 	UTF8ArabicPoints();
 	virtual ~UTF8ArabicPoints();
 	virtual char processText(SWBuf &text, const SWKey *key = 0, const SWModule *module = 0);
+private:
+	static char* next_mark(const char* from, int* mark_size); // locates the next Arabic vowel mark at or after "from"; no class qualifier on an in-class declaration
 };
 
 SWORD_NAMESPACE_END

Modified: trunk/src/modules/filters/utf8arabicpoints.cpp
===================================================================
--- trunk/src/modules/filters/utf8arabicpoints.cpp	2013-07-08 09:01:53 UTC (rev 2861)
+++ trunk/src/modules/filters/utf8arabicpoints.cpp	2013-07-08 09:08:04 UTC (rev 2862)
@@ -40,24 +40,140 @@
 
 UTF8ArabicPoints::~UTF8ArabicPoints(){};
 
+char* UTF8ArabicPoints::next_mark(const char* from, int* mark_size)
+{
+	// Returns a pointer to the first Arabic vowel mark at or after "from" and stores its UTF-8 byte length (2 or 3) in
+	// *mark_size; when no mark remains, returns a pointer to the terminating NUL and leaves *mark_size untouched.
+	// Marks targeted for elimination (table entries excerpted from http://www.utf8-chartable.de/unicode-utf8-table.pl):
+	// Code   UTF-8     Description
+	// point
+	// -----  --------- -----------
+	// U+064B d9 8b     ARABIC FATHATAN
+	// U+064C d9 8c     ARABIC DAMMATAN
+	// U+064D d9 8d     ARABIC KASRATAN
+	// U+064E d9 8e     ARABIC FATHA
+	// U+064F d9 8f     ARABIC DAMMA
+	// U+0650 d9 90     ARABIC KASRA
+	// U+0651 d9 91     ARABIC SHADDA
+	// U+0652 d9 92     ARABIC SUKUN
+	// U+0653 d9 93     ARABIC MADDAH ABOVE
+	// U+0654 d9 94     ARABIC HAMZA ABOVE
+	// U+0655 d9 95     ARABIC HAMZA BELOW
+	//
+	// U+FC5E ef b1 9e  ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM
+	// U+FC5F ef b1 9f  ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM
+	// U+FC60 ef b1 a0  ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM
+	// U+FC61 ef b1 a1  ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM
+	// U+FC62 ef b1 a2  ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM
+	// U+FC63 ef b1 a3  ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM
+	//
+	// U+FE70 ef b9 b0  ARABIC FATHATAN ISOLATED FORM
+	// U+FE71 ef b9 b1  ARABIC TATWEEL WITH FATHATAN ABOVE
+	// U+FE72 ef b9 b2  ARABIC DAMMATAN ISOLATED FORM
+	// U+FE73 ef b9 b3  ARABIC TAIL FRAGMENT
+	// U+FE74 ef b9 b4  ARABIC KASRATAN ISOLATED FORM
+	// U+FE75 ef b9 b5  (no character name listed; still inside the 0xB0-0xBF range matched below)
+	// U+FE76 ef b9 b6  ARABIC FATHA ISOLATED FORM
+	// U+FE77 ef b9 b7  ARABIC FATHA MEDIAL FORM
+	// U+FE78 ef b9 b8  ARABIC DAMMA ISOLATED FORM
+	// U+FE79 ef b9 b9  ARABIC DAMMA MEDIAL FORM
+	// U+FE7A ef b9 ba  ARABIC KASRA ISOLATED FORM
+	// U+FE7B ef b9 bb  ARABIC KASRA MEDIAL FORM
+	// U+FE7C ef b9 bc  ARABIC SHADDA ISOLATED FORM
+	// U+FE7D ef b9 bd  ARABIC SHADDA MEDIAL FORM
+	// U+FE7E ef b9 be  ARABIC SUKUN ISOLATED FORM
+	// U+FE7F ef b9 bf  ARABIC SUKUN MEDIAL FORM
 
-char UTF8ArabicPoints::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
-	if (!option) {
-		//The UTF-8 range 0xFC 0xE5 to 0xFC 0x63 consist of Arabic vowel marks so block those out.
-		// Also ranges 0xFE70 til OxFE7F and 0x064b-0x0655
-		SWBuf orig = text;
-		const unsigned char* from = (unsigned char*)orig.c_str();
-		for (text = ""; *from; from++) {
-			if ((*from == 0xD9) && (*(from + 1) >= 0x8B && *(from + 1) <= 0x95)) {
-				from++;
+	unsigned char* byte = (unsigned char*) from; // unsigned view so bytes >= 0x80 compare correctly with the hex constants; also drops const
+	for (; *byte; ++byte) {
+		if (byte[0] == 0xD9) {
+			if (byte[1] >= 0x8B && byte[1] <= 0x95) {
+			  *mark_size = 2;
+			  break;
 			}
-			else if ((*from == 0xEF) &&
-				 (((*(from + 1) == 0xB1) && (*(from + 2) >= 0x9E && *(from + 2) <= 0xA3)) ||
-				  ((*(from + 1) == 0xB9) && (*(from + 2) >= 0x4B && *(from + 2) <= 0x55)))) {
-				from += 2;
-			}
+            continue; // 0xD9 lead byte, but the second byte is outside the mark range
 		}
+		if (byte[0] == 0xEF) {
+			if (byte[1] == 0xB1) {
+                if (byte[2] >= 0x9E && byte[2] <= 0xA3) {
+                    *mark_size = 3;
+                    break;
+                }
+                continue; // ef b1 xx, but xx is not one of the ligature marks
+            }
+			if (byte[1] == 0xB9) {
+                if (byte[2] >= 0xB0 && byte[2] <= 0xBF) {
+                    *mark_size = 3;
+                    break;
+                }
+                continue; // ef b9 xx, but xx is outside b0-bf
+            }
+		}
 	}
+	return (char*)byte; // the mark that was found, or the NUL terminator if the scan ran out
+}
+
+
+char UTF8ArabicPoints::processText(SWBuf &text, const SWKey *, const SWModule *) {
+    // A non-zero/true option setting means that setOptionValue("On")
+    // was called which apparently means that Arabic Vowel Marks are ENABLED,
+	// so the filter's actions are DISABLED.
+	if (option)
+		return 0;
+
+	// Eliminate Arabic vowel marks from the text, in place; always returns 0.
+	// The recognized marks are determined by the "next_mark" function.
+
+	// If next_mark were polymorphic (a virtual function or a function
+	// pointer), this function could be generically used in any filter that
+	// only removed (vs. replaced) areas of text based on the arbitrary
+	// match criteria encapsulated in the specific next_mark
+	// implementation.
+	int mark_size = 0;
+	char* mark_pos = next_mark(text.c_str(), &mark_size);
+
+	// Here and at the end of the loop,
+	// test BOTH mark_pos AND *mark_pos for safety and to give next_mark
+	// the option of returning either NULL or a pointer to the null
+	// terminator when done.
+	if (!mark_pos || !*mark_pos)
+		return 0; // no marks found.
+	
+	// Purposely writing into text's own buffer via "end_of_output" avoids a
+	// temporary SWBuf copy. NOTE(review): the text is shortened through a raw
+	// pointer and nothing here updates any length SWBuf tracks -- confirm.
+	// Text before the first mark is already in place, so start appending at the mark.
+	char* end_of_output = mark_pos;
+
+	// For consistency, input starts at (vs. after) the first mark as well
+	// -- not a problem since the mark itself gets skipped, anyway.
+	const char* start_of_input = mark_pos;
+	do {
+		// At this point, "mark_pos" and "mark_pos+mark_size" delimit
+		// the text to drop.
+		// "start_of_input" is either mark_pos or any text between the
+        	// end of any previous mark and the current mark_pos.
+		// This text is now ready to be moved into the output.
+		int ready_size = mark_pos - start_of_input;
+		if (ready_size > 0) {
+			// Append the input text before the current mark to the
+			// output.
+			// Must use memmove (not strncpy), which is safe for
+			// overlapping ranges, because the final end_of_output
+			// may overtake the original start_of_input.
+			memmove(end_of_output, start_of_input, ready_size);
+			// Keep appending to end_of_output.
+			end_of_output += ready_size;
+		}
+		// Ensure the mark never gets copied.
+		start_of_input = mark_pos + mark_size;
+		// Find the next mark.
+		mark_pos = next_mark(start_of_input, &mark_size);
+
+	} while (mark_pos && *mark_pos); // Repeat while another mark was found.
+
+	// Copy any trailing input text AND always the terminating null.
+	memmove(end_of_output, start_of_input, strlen(start_of_input)+1);
 	return 0;
 }