[sword-cvs] icu-sword/source/tools/genprops/misc ucdmerge.c,NONE,1.1 ucdstrip.c,NONE,1.1 ucdstrip.pl,NONE,1.1

sword@www.crosswire.org sword@www.crosswire.org
Tue, 9 Sep 2003 19:43:00 -0700


Update of /usr/local/cvsroot/icu-sword/source/tools/genprops/misc
In directory www:/tmp/cvs-serv19862/source/tools/genprops/misc

Added Files:
	ucdmerge.c ucdstrip.c ucdstrip.pl 
Log Message:
ICU 2.6 commit

--- NEW FILE: ucdmerge.c ---
/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucdmerge.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003feb20
*   created by: Markus W. Scherer
*
*   Simple tool for Unicode Character Database files with semicolon-delimited fields.
*   Merges adjacent, identical per-code point data lines into one line with range syntax.
*
*   To compile, just call a C compiler/linker with this source file.
*   On Windows: cl ucdmerge.c
*/

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
 * Return a pointer to the first character of s that is neither a space
 * nor a horizontal tab. If s consists only of such characters, the result
 * points at the terminating NUL.
 */
static const char *
skipWhitespace(const char *s) {
    /* strspn() counts the leading run of characters from the given set */
    return s+strspn(s, " \t");
}

/*
 * Return the position just past the data portion of the line l.
 *
 * If l contains a '#', the data ends before the first '#' and before any
 * spaces/tabs that immediately precede it. Otherwise the data runs to the
 * end of the string and the result points at the terminating NUL.
 */
static char *
endOfData(const char *l) {
    char *stop=strchr(l, '#');

    if(stop==NULL) {
        /* no comment on this line: data extends to the end of the string */
        return strchr(l, '\0');
    }

    /* back up over blanks in front of the comment, but never before l */
    while(stop!=l && (stop[-1]==' ' || stop[-1]=='\t')) {
        --stop;
    }
    return stop;
}

/*
 * Compare the data portions of two lines.
 *
 * The data portion of a line starts right after its first semicolon and
 * ends where endOfData() says it ends (before a trailing comment and the
 * blanks in front of it, or at the end of the string).
 *
 * Returns non-zero if both data portions have the same length and the same
 * bytes, 0 otherwise. A line without any semicolon has no data portion,
 * so the result is 0 if either line lacks one.
 */
static int
sameData(const char *l1, const char *l2) {
    const char *end1, *end2;
    size_t length;

    /* find the first semicolon in each line */
    l1=strchr(l1, ';');
    l2=strchr(l2, ';');
    if(l1==NULL || l2==NULL) {
        /* avoid pointer arithmetic on NULL; nothing to compare */
        return 0;
    }
    ++l1;
    ++l2;

    /* find the end of data: end of string or start of comment */
    end1=endOfData(l1);
    end2=endOfData(l2);

    /* compare the line data portions */
    length=(size_t)(end1-l1);
    return length==(size_t)(end2-l2) && 0==memcmp(l1, l2, length);
}

/*
 * Read one line from stdin into buf (capacity cap bytes) and strip the
 * trailing newline, like gets() did but without writing past the buffer.
 *
 * Returns 1 if a line was read, 0 at end of input or on a read error.
 * A line that does not fit into buf cannot be processed correctly, so the
 * program reports it on stderr and exits with status 1.
 */
static int
readLine(char *buf, size_t cap) {
    size_t length;
    int next;

    if(fgets(buf, (int)cap, stdin)==NULL) {
        return 0;
    }

    length=strlen(buf);
    if(length>0 && buf[length-1]=='\n') {
        buf[length-1]=0;
    } else if(length==cap-1) {
        /*
         * The buffer is full without a newline: fine only if the line
         * ends exactly here (newline or end of input comes next).
         */
        next=getc(stdin);
        if(next!=EOF && next!='\n') {
            fprintf(stderr, "ucdmerge: input line longer than %lu characters\n",
                    (unsigned long)(cap-1));
            exit(1);
        }
    }
    return 1;
}

/*
 * Filter stdin to stdout.
 *
 * A "data line" is a line that starts with a single non-negative hexadecimal
 * code point followed (after optional blanks) by a semicolon. Runs of data
 * lines with consecutive code points and identical data (see sameData())
 * are merged into one output line "first..last;data". All other lines are
 * copied through unchanged.
 *
 * Comments of a merged range: if only one of the first/last lines has a
 * comment it is kept; if both have one, the output comment is
 * "<first comment>..<last comment text>".
 *
 * Returns 0 on success, 1 if an I/O error was recorded on stdin or stdout.
 */
extern int
main(int argc, const char *argv[]) {
    static char line[2000], firstLine[2000], lastLine[2000];
    char *end;
    long first, last, c;
    int finished;

    (void)argc;
    (void)argv;

    /* first/last<0 means that there is no pending range */
    first=last=-1;
    finished=0;

    for(;;) {
        if(readLine(line, sizeof(line))) {
            /* parse the initial code point, if any */
            c=strtol(line, &end, 16);
            if(end!=line && *skipWhitespace(end)==';') {
                /* single code point followed by semicolon and data, keep c */
            } else {
                c=-1;
            }
        } else {
            /* end of input: run the loop body once more to flush a pending range */
            line[0]=0;
            c=-1;
            finished=1;
        }

        if(last>=0 && (c!=(last+1) || !sameData(firstLine, line))) {
            /* the current line does not continue the pending range: output the range */
            if(first==last) {
                /* there was no range, just output the one line we found */
                puts(firstLine);
            } else {
                /* there was a real range, merge their lines */
                end=strchr(lastLine, '#');
                if(end==NULL) {
                    /* no comment in last line */
                    printf("%04lX..%04lX%s\n",
                            first, last,            /* code point range */
                            strchr(firstLine, ';'));/* first line starting from the first ; */
                } else if(strchr(firstLine, '#')==NULL) {
                    /* no comment in first line */
                    printf("%04lX..%04lX%s%s\n",
                            first, last,            /* code point range */
                            strchr(firstLine, ';'), /* first line starting from the first ; */
                            end);                   /* comment from last line */
                } else {
                    /* merge comments from both lines */
                    printf("%04lX..%04lX%s..%s\n",
                            first, last,            /* code point range */
                            strchr(firstLine, ';'), /* first line starting from the first ; */
                            skipWhitespace(end+1)); /* comment from last line, after # and spaces */
                }
            }
            first=last=-1;
        }

        if(c<0) {
            if(finished) {
                break;
            }

            /* no data on this line, output as is */
            puts(line);
        } else {
            /* data on this line, store for possible range compaction */
            if(last<0) {
                /* set as the first line in a possible range */
                first=last=c;
                strcpy(firstLine, line);
                lastLine[0]=0;
            } else /* must be c==(last+1) && sameData() because of previous conditions */ {
                /* continue with the current range */
                last=c;
                strcpy(lastLine, line);
            }
        }
    }

    return (ferror(stdin) || ferror(stdout)) ? 1 : 0;
}

--- NEW FILE: ucdstrip.c ---
/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucdstrip.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003feb20
*   created by: Markus W. Scherer
*
*   Simple tool for Unicode Character Database files with semicolon-delimited fields.
*   Removes comments behind data lines but not in others.
*
*   To compile, just call a C compiler/linker with this source file.
*   On Windows: cl ucdstrip.c
*/

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
 * Locate the end of the data on line l.
 *
 * Returns a pointer to the first character after the data: the start of
 * the run of spaces/tabs that precedes the first '#', or the terminating
 * NUL if the line has no '#'.
 */
static char *
endOfData(const char *l) {
    char *p=strchr(l, '#');
    char prev;

    if(p==NULL) {
        /* no comment: the data ends with the string */
        return strchr(l, '\0');
    }

    /* step back over whitespace before the comment, stopping at the line start */
    for(; p!=l; --p) {
        prev=p[-1];
        if(prev!=' ' && prev!='\t') {
            break;
        }
    }
    return p;
}

/*
 * Read one line from stdin into buf (capacity cap bytes) and strip the
 * trailing newline, like gets() did but without writing past the buffer.
 *
 * Returns 1 if a line was read, 0 at end of input or on a read error.
 * A line that does not fit into buf cannot be processed correctly, so the
 * program reports it on stderr and exits with status 1.
 */
static int
readLine(char *buf, size_t cap) {
    size_t length;
    int next;

    if(fgets(buf, (int)cap, stdin)==NULL) {
        return 0;
    }

    length=strlen(buf);
    if(length>0 && buf[length-1]=='\n') {
        buf[length-1]=0;
    } else if(length==cap-1) {
        /*
         * The buffer is full without a newline: fine only if the line
         * ends exactly here (newline or end of input comes next).
         */
        next=getc(stdin);
        if(next!=EOF && next!='\n') {
            fprintf(stderr, "ucdstrip: input line longer than %lu characters\n",
                    (unsigned long)(cap-1));
            exit(1);
        }
    }
    return 1;
}

/*
 * Filter stdin to stdout, removing trailing comments from data lines.
 *
 * A line is treated as a data line if strtol() can parse a non-negative
 * hexadecimal number at its start. On such lines everything from the
 * whitespace before the first '#' onwards is cut off (see endOfData()).
 * All other lines are copied through unchanged.
 *
 * Returns 0 on success, 1 if an I/O error was recorded on stdin or stdout.
 */
extern int
main(int argc, const char *argv[]) {
    static char line[2000];
    char *end;

    (void)argc;
    (void)argv;

    while(readLine(line, sizeof(line))) {
        if(strtol(line, &end, 16)>=0 && end!=line) {
            /* line starts with a code point (or range), remove the comment */
            *endOfData(line)=0;
        }
        puts(line);
    }

    return (ferror(stdin) || ferror(stdout)) ? 1 : 0;
}

--- NEW FILE: ucdstrip.pl ---
#!/usr/lib/perl -p
# NOTE(review): the interpreter path /usr/lib/perl is unusual - confirm it matches the target system.
# Copyright (c) 2001-2003 International Business Machines
# Corporation and others. All Rights Reserved.
# Simple tool for Unicode Character Database files with semicolon-delimited fields.
# Removes comments behind data lines but not in others.
# The Perl option -p above wraps the script in a while(<>) loop and prints $_
# after each iteration, so every input line is output, modified or not.
#
# On a line that starts with hexadecimal digits, drop the first '#' through the
# end of the line together with the spaces directly in front of it.
# $1 (rather than the sed-style \1) is the Perl way to refer to the capture
# group on the replacement side.
s/^([0-9a-fA-F]+.+?) *#.*/$1/;