// tzafrir/geresh -- mirror of git://git.tzafrir.org.il/git/geresh.git
							// Copyright (C) 2003 Mooffie <mooffie@typo.co.il>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.

#include <config.h>

#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>

#include "transtbl.h"
#include "io.h" // set_last_error
#include "dbg.h"

// Most of the code below deals with parsing a TranslationTable
// file. Such files consist of lines of the form:
//
//   <character-from> <character-to>
//
// that map character-from to character-to.
//
// <character-xxx> can be in one of three forms:
//
// 1. ' literal-character '
// 2. decimal-number .
// 3. hex-number
//
// Examples:
//
// 'a' 5d0    # maps 'a' to Hebrew letter Alef
// 'a' 1488.  # the same
// 'a' 'b'    # maps 'a' to 'b'
//
// literal-character is UTF-8 encoded.


// parse_next_char() - parses the next <character> token. (this is a
// misnomer, because one might think we mean C's "char".)
//
// If there was no lexical error, returns a pointer to the end of the
// token (so one can continue to parse the next token); else returns 
// NULL.

static char *parse_next_char(char *s, unichar &ch)
{
    while (*s == ' ' || *s == '\t')
	s++;
    if (!*s)
	return NULL;
    if (*s == '\'') {
	s++;
	char *end = strchr(s + 1, '\'');
	if (!end)
	    return NULL;
	unistring us;
	us.init_from_utf8(s, end - s);
	if (us.size() != 1)
	    return NULL;
	ch = us[0];
	return end + 1;
    } else {
	char *end;
	errno = 0;
	int val = strtol(s, &end, 16);
	if (*end == '.') {
	    *end = ' ';
	    val = strtol(s, &end, 10);
	}
	if (errno || (*end != '\0' && *end != ' ' && *end != '\t'))
	    return NULL;
	ch = (unichar)val;
	return end;
    }
}

// strip_comment() - truncates 'line' at the '#' that starts a comment.
//
// A '#' inside an apostrophe-quoted literal (as in  '#' 23 ) is part of
// the token and is left alone. Quoted literals are delimited the same way
// parse_next_char() delimits them: the closing apostrophe is searched for
// only after the first content byte, so ''' is a quoted apostrophe.

static void strip_comment(char *line)
{
    for (char *p = line; *p; p++) {
	if (*p == '#') {
	    *p = '\0';
	    return;
	}
	if (*p == '\'' && p[1]) {
	    char *close = strchr(p + 2, '\'');
	    if (close)
		p = close;  // the loop's p++ steps past the closing quote
	}
    }
}

// load(filename) - loads--that is, parses--a file. It reads the file line
// by line and for each line calls parse_next_char() to parse the two
// <character> tokens. It then adds the mapping to the map table.
//
// Any mappings already in the table are discarded first. Everything from
// an unquoted '#' to the end of the line is a comment. Lines that do not
// yield two valid tokens are silently skipped.
//
// Returns true on success. Returns false, after passing errno to
// set_last_error(), if the file cannot be opened or a read error occurs.

bool TranslationTable::load(const char *filename)
{
    const size_t line_capacity = 1024;
    charmap.clear();

    FILE *fp = fopen(filename, "r");
    if (!fp) {
	set_last_error(errno);
	return false;
    }
    DBG(1, ("Reading translation table %s\n", filename));

    char line[line_capacity];
    while (fgets(line, static_cast<int>(line_capacity), fp)) {
	size_t len = strlen(line);
	if (len && line[len-1] == '\n') {
	    line[--len] = '\0';
	    // Tolerate DOS line endings; a stray '\r' would otherwise
	    // make a trailing number fail to parse.
	    if (len && line[len-1] == '\r')
		line[--len] = '\0';
	} else if (len == line_capacity - 1) {
	    // The line did not fit in the buffer. Throw away the rest of
	    // it so it is not mistaken for a line of its own.
	    int c;
	    while ((c = getc(fp)) != EOF && c != '\n')
		;
	}
	strip_comment(line);

	unichar ch1, ch2;
	char *s = parse_next_char(line, ch1);
	if (s && parse_next_char(s, ch2))
	    charmap[ch1] = ch2;
    }

    // fgets() returns NULL on a read error as well as on end-of-file;
    // tell the two apart. errno is saved before fclose() can change it.
    const int read_errno = errno;
    const bool read_failed = ferror(fp) != 0;
    fclose(fp);
    if (read_failed) {
	set_last_error(read_errno);
	return false;
    }

    return true;
}

// translate_char() - looks 'ch' up in the map table. If a mapping exists,
// 'ch' is overwritten with the mapped character and true is returned;
// otherwise 'ch' is left as it was and false is returned.

bool TranslationTable::translate_char(unichar &ch) const
{
    typedef std::map<unichar, unichar>::const_iterator map_iterator;

    const map_iterator found = charmap.find(ch);
    if (found == charmap.end())
	return false;

    ch = found->second;
    return true;
}