Quelle hashmgr.cxx Sprache: C

/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
* And Contributors.  All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
*    notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
*    notice, this list of conditions and the following disclaimer in the
*    documentation and/or other materials provided with the distribution.
*
* 3. All modifications to the source code must be clearly marked as
*    such.  Binary redistributions based on modified source code
*    must be clearly marked as modified versions in the documentation
*    and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/

#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>
#include <limits>
#include <sstream>

#include "hashmgr.hxx"
#include "csutil.hxx"
#include "atypes.hxx"
#include "langnum.hxx"

// build a hash table from a munched word list

HashMgr::HashMgr(const char* tpath, const char* apath, const char* key)
    : tablesize(0),
      tableptr(NULL),
      flag_mode(FLAG_CHAR),
      complexprefixes(0),
      utf8(0),
      forbiddenword(FORBIDDENWORD)  // forbidden word signing flag
      ,
      numaliasf(0),
      aliasf(NULL),
      aliasflen(0),
      numaliasm(0),
      aliasm(NULL) {
  langnum = 0;
  csconv = 0;
  load_config(apath, key);
  int ec = load_tables(tpath, key);
  if (ec) {
    /* error condition - what should we do here */
    HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n", ec);
    free(tableptr);
    //keep tablesize to 1 to fix possible division with zero
    tablesize = 1;
    tableptr = (struct hentry**)calloc(tablesize, sizeof(struct hentry*));
    if (!tableptr) {
      tablesize = 0;
    }
  }
}

HashMgr::~HashMgr() {
  if (tableptr) {
    // now pass through hash table freeing up everything
    // go through column by column of the table
    for (int i = 0; i < tablesize; i++) {
      struct hentry* pt = tableptr[i];
      struct hentry* nt = NULL;
      while (pt) {
        nt = pt->next;
        if (pt->astr &&
            (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen)))
          arena_free(pt->astr);
        arena_free(pt);
        pt = nt;
      }
    }
    free(tableptr);
  }
  tablesize = 0;

  if (aliasf) {
    for (int j = 0; j < (numaliasf); j++)
      arena_free(aliasf[j]);
    arena_free(aliasf);
    aliasf = NULL;
    if (aliasflen) {
      arena_free(aliasflen);
      aliasflen = NULL;
    }
  }
  if (aliasm) {
    for (int j = 0; j < (numaliasm); j++)
      arena_free(aliasm[j]);
    arena_free(aliasm);
    aliasm = NULL;
  }

#ifndef OPENOFFICEORG
#ifndef MOZILLA_CLIENT
  if (utf8)
    free_utf_tbl();
#endif
#endif

#ifdef MOZILLA_CLIENT
  delete[] csconv;
#endif

  assert(outstanding_arena_allocations == 0);
}

// lookup a root word in the hashtable

struct hentry* HashMgr::lookup(const char* word) const {
  struct hentry* dp;
  if (tableptr) {
    dp = tableptr[hash(word)];
    if (!dp)
      return NULL;
    for (; dp != NULL; dp = dp->next) {
      if (strcmp(word, dp->word) == 0)
        return dp;
    }
  }
  return NULL;
}

// add a word to the hash table (private)
int HashMgr::add_word(const std::string& in_word,
                      int wcl,
                      unsigned short* aff,
                      int al,
                      const std::string* in_desc,
                      bool onlyupcase,
                      int captype) {
  const std::string* word = &in_word;
  const std::string* desc = in_desc;

  std::string *word_copy = NULL;
  std::string *desc_copy = NULL;
  if ((!ignorechars.empty() && !has_no_ignored_chars(in_word, ignorechars)) || complexprefixes) {
    word_copy = new std::string(in_word);

    if (!ignorechars.empty()) {
      if (utf8) {
        wcl = remove_ignored_chars_utf(*word_copy, ignorechars_utf16);
      } else {
        remove_ignored_chars(*word_copy, ignorechars);
      }
    }

    if (complexprefixes) {
      if (utf8)
        wcl = reverseword_utf(*word_copy);
      else
        reverseword(*word_copy);

      if (in_desc && !aliasm) {
        desc_copy = new std::string(*in_desc);

        if (complexprefixes) {
          if (utf8)
            reverseword_utf(*desc_copy);
          else
            reverseword(*desc_copy);
        }
        desc = desc_copy;
      }
    }

    word = word_copy;
  }

  bool upcasehomonym = false;
  int descl = desc ? (aliasm ? sizeof(char*) : desc->size() + 1) : 0;
  // variable-length hash record with word and optional fields
  struct hentry* hp =
      (struct hentry*)arena_alloc(sizeof(struct hentry) + word->size() + descl);
  if (!hp) {
    delete desc_copy;
    delete word_copy;
    return 1;
  }

  char* hpw = hp->word;
  strcpy(hpw, word->c_str());

  int i = hash(hpw);

  hp->blen = (unsigned char)word->size();
  hp->clen = (unsigned char)wcl;
  hp->alen = (short)al;
  hp->astr = aff;
  hp->next = NULL;
  hp->next_homonym = NULL;
  hp->var = (captype == INITCAP) ? H_OPT_INITCAP : 0;

  // store the description string or its pointer
  if (desc) {
    hp->var |= H_OPT;
    if (aliasm) {
      hp->var |= H_OPT_ALIASM;
      store_pointer(hpw + word->size() + 1, get_aliasm(atoi(desc->c_str())));
    } else {
      strcpy(hpw + word->size() + 1, desc->c_str());
    }
    if (strstr(HENTRY_DATA(hp), MORPH_PHON)) {
      hp->var |= H_OPT_PHON;
      // store ph: fields (pronounciation, misspellings, old orthography etc.)
      // of a morphological description in reptable to use in REP replacements.
      if (reptable.capacity() < (unsigned int)(tablesize/MORPH_PHON_RATIO))
          reptable.reserve(tablesize/MORPH_PHON_RATIO);
      std::string fields = HENTRY_DATA(hp);
      std::string::const_iterator iter = fields.begin();
      std::string::const_iterator start_piece = mystrsep(fields, iter);
      while (start_piece != fields.end()) {
        if (std::string(start_piece, iter).find(MORPH_PHON) == 0) {
          std::string ph = std::string(start_piece, iter).substr(sizeof MORPH_PHON - 1);
          if (ph.size() > 0) {
            std::vector<w_char> w;
            size_t strippatt;
            std::string wordpart;
            // dictionary based REP replacement, separated by "->"
            // for example "pretty ph:prity ph:priti->pretti" to handle
            // both prity -> pretty and pritier -> prettiest suggestions.
            if (((strippatt = ph.find("->")) != std::string::npos) &&
                    (strippatt > 0) && (strippatt < ph.size() - 2)) {
                wordpart = ph.substr(strippatt + 2);
                ph.erase(ph.begin() + strippatt, ph.end());
            } else
                wordpart = in_word;
            // when the ph: field ends with the character *,
            // strip last character of the pattern and the replacement
            // to match in REP suggestions also at character changes,
            // for example, "pretty ph:prity*" results "prit->prett"
            // REP replacement instead of "prity->pretty", to get
            // prity->pretty and pritiest->prettiest suggestions.
            if (ph.at(ph.size()-1) == '*') {
              strippatt = 1;
              size_t stripword = 0;
              if (utf8) {
                while ((strippatt < ph.size()) &&
                  ((ph.at(ph.size()-strippatt-1) & 0xc0) == 0x80))
                     ++strippatt;
                while ((stripword < wordpart.size()) &&
                  ((wordpart.at(wordpart.size()-stripword-1) & 0xc0) == 0x80))
                     ++stripword;
              }
              ++strippatt;
              ++stripword;
              if ((ph.size() > strippatt) && (wordpart.size() > stripword)) {
                ph.erase(ph.size()-strippatt, strippatt);
                wordpart.erase(in_word.size()-stripword, stripword);
              }
            }
            // capitalize lowercase pattern for capitalized words to support
            // good suggestions also for capitalized misspellings, eg.
            // Wednesday ph:wendsay
            // results wendsay -> Wednesday and Wendsay -> Wednesday, too.
            if (captype==INITCAP) {
              std::string ph_capitalized;
              if (utf8) {
                u8_u16(w, ph);
                if (get_captype_utf8(w, langnum) == NOCAP) {
                  mkinitcap_utf(w, langnum);
                  u16_u8(ph_capitalized, w);
                }
              } else if (get_captype(ph, csconv) == NOCAP)
                  mkinitcap(ph_capitalized, csconv);

              if (ph_capitalized.size() > 0) {
                // add also lowercase word in the case of German or
                // Hungarian to support lowercase suggestions lowercased by
                // compound word generation or derivational suffixes
                // (for example by adjectival suffix "-i" of geographical
                // names in Hungarian:
                // Massachusetts ph:messzecsuzec
                // messzecsuzeci -> massachusettsi (adjective)
                // For lowercasing by conditional PFX rules, see
                // tests/germancompounding test example or the
                // Hungarian dictionary.)
                if (langnum == LANG_de || langnum == LANG_hu) {
                  std::string wordpart_lower(wordpart);
                  if (utf8) {
                    u8_u16(w, wordpart_lower);
                    mkallsmall_utf(w, langnum);
                    u16_u8(wordpart_lower, w);
                  } else {
                    mkallsmall(wordpart_lower, csconv);
                  }
                  reptable.push_back(replentry());
                  reptable.back().pattern.assign(ph);
                  reptable.back().outstrings[0].assign(wordpart_lower);
                }
                reptable.push_back(replentry());
                reptable.back().pattern.assign(ph_capitalized);
                reptable.back().outstrings[0].assign(wordpart);
              }
            }
            reptable.push_back(replentry());
            reptable.back().pattern.assign(ph);
            reptable.back().outstrings[0].assign(wordpart);
          }
        }
        start_piece = mystrsep(fields, iter);
      }
    }
  }

  struct hentry* dp = tableptr[i];
  if (!dp) {
    tableptr[i] = hp;
    delete desc_copy;
    delete word_copy;
    return 0;
  }
  while (dp->next != NULL) {
    if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) {
      // remove hidden onlyupcase homonym
      if (!onlyupcase) {
        if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
          arena_free(dp->astr);
          dp->astr = hp->astr;
          dp->alen = hp->alen;
          arena_free(hp);
          delete desc_copy;
          delete word_copy;
          return 0;
        } else {
          dp->next_homonym = hp;
        }
      } else {
        upcasehomonym = true;
      }
    }
    dp = dp->next;
  }
  if (strcmp(hp->word, dp->word) == 0) {
    // remove hidden onlyupcase homonym
    if (!onlyupcase) {
      if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
        arena_free(dp->astr);
        dp->astr = hp->astr;
        dp->alen = hp->alen;
        arena_free(hp);
        delete desc_copy;
        delete word_copy;
        return 0;
      } else {
        dp->next_homonym = hp;
      }
    } else {
      upcasehomonym = true;
    }
  }
  if (!upcasehomonym) {
    dp->next = hp;
  } else {
    // remove hidden onlyupcase homonym
    if (hp->astr)
      arena_free(hp->astr);
    arena_free(hp);
  }

  delete desc_copy;
  delete word_copy;
  return 0;
}

int HashMgr::add_hidden_capitalized_word(const std::string& word,
                                         int wcl,
                                         unsigned short* flags,
                                         int flagslen,
                                         const std::string* dp,
                                         int captype) {
  if (flags == NULL)
    flagslen = 0;

  // add inner capitalized forms to handle the following allcap forms:
  // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG
  // Allcaps with suffixes: CIA's -> CIA'S
  if (((captype == HUHCAP) || (captype == HUHINITCAP) ||
       ((captype == ALLCAP) && (flagslen != 0))) &&
      !((flagslen != 0) && TESTAFF(flags, forbiddenword, flagslen))) {
    unsigned short* flags2 =
        (unsigned short*)arena_alloc(sizeof(unsigned short) * (flagslen + 1));
    if (!flags2)
      return 1;
    if (flagslen)
      memcpy(flags2, flags, flagslen * sizeof(unsigned short));
    flags2[flagslen] = ONLYUPCASEFLAG;
    if (utf8) {
      std::string st;
      std::vector<w_char> w;
      u8_u16(w, word);
      mkallsmall_utf(w, langnum);
      mkinitcap_utf(w, langnum);
      u16_u8(st, w);
      return add_word(st, wcl, flags2, flagslen + 1, dp, true, INITCAP);
    } else {
      std::string new_word(word);
      mkallsmall(new_word, csconv);
      mkinitcap(new_word, csconv);
      int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true, INITCAP);
      return ret;
    }
  }
  return 0;
}

// detect captype and modify word length for UTF-8 encoding
int HashMgr::get_clen_and_captype(const std::string& word, int* captype, std::vector<w_char> &workbuf) {
  int len;
  if (utf8) {
    len = u8_u16(workbuf, word);
    *captype = get_captype_utf8(workbuf, langnum);
  } else {
    len = word.size();
    *captype = get_captype(word, csconv);
  }
  return len;
}

int HashMgr::get_clen_and_captype(const std::string& word, int* captype) {
  std::vector<w_char> workbuf;
  return get_clen_and_captype(word, captype, workbuf);
}

// remove word (personal dictionary function for standalone applications)
int HashMgr::remove(const std::string& word) {
  struct hentry* dp = lookup(word.c_str());
  while (dp) {
    if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) {
      unsigned short* flags =
          (unsigned short*)arena_alloc(sizeof(unsigned short) * (dp->alen + 1));
      if (!flags)
        return 1;
      for (int i = 0; i < dp->alen; i++)
        flags[i] = dp->astr[i];
      flags[dp->alen] = forbiddenword;
      arena_free(dp->astr);
      dp->astr = flags;
      dp->alen++;
      std::sort(flags, flags + dp->alen);
    }
    dp = dp->next_homonym;
  }
  return 0;
}

/* remove forbidden flag to add a personal word to the hash */
int HashMgr::remove_forbidden_flag(const std::string& word) {
  struct hentry* dp = lookup(word.c_str());
  if (!dp)
    return 1;
  while (dp) {
    if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen))
      dp->alen = 0;  // XXX forbidden words of personal dic.
    dp = dp->next_homonym;
  }
  return 0;
}

// add a custom dic. word to the hash table (public)
int HashMgr::add(const std::string& word) {
  if (remove_forbidden_flag(word)) {
    int captype;
    int al = 0;
    unsigned short* flags = NULL;
    int wcl = get_clen_and_captype(word, &captype);
    add_word(word, wcl, flags, al, NULL, false, captype);
    return add_hidden_capitalized_word(word, wcl, flags, al, NULL,
                                       captype);
  }
  return 0;
}

int HashMgr::add_with_affix(const std::string& word, const std::string& example) {
  // detect captype and modify word length for UTF-8 encoding
  struct hentry* dp = lookup(example.c_str());
  remove_forbidden_flag(word);
  if (dp && dp->astr) {
    int captype;
    int wcl = get_clen_and_captype(word, &captype);
    if (aliasf) {
      add_word(word, wcl, dp->astr, dp->alen, NULL, false, captype);
    } else {
      unsigned short* flags =
          (unsigned short*) arena_alloc(dp->alen * sizeof(unsigned short));
      if (flags) {
        memcpy((void*)flags, (void*)dp->astr,
               dp->alen * sizeof(unsigned short));
        add_word(word, wcl, flags, dp->alen, NULL, false, captype);
      } else
        return 1;
    }
    return add_hidden_capitalized_word(word, wcl, dp->astr,
                                       dp->alen, NULL, captype);
  }
  return 1;
}

// walk the hash table entry by entry - null at end
// initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp);
struct hentry* HashMgr::walk_hashtable(int& col, struct hentry* hp) const {
  if (hp && hp->next != NULL)
    return hp->next;
  for (col++; col < tablesize; col++) {
    if (tableptr[col])
      return tableptr[col];
  }
  // null at end and reset to start
  col = -1;
  return NULL;
}

// load a munched word list and build a hash table on the fly
int HashMgr::load_tables(const char* tpath, const char* key) {
  // open dictionary file
  FileMgr* dict = new FileMgr(tpath, key);
  if (dict == NULL)
    return 1;

  // first read the first line of file to get hash table size */
  std::string ts;
  if (!dict->getline(ts)) {
    HUNSPELL_WARNING(stderr, "error: empty dic file %s\n", tpath);
    delete dict;
    return 2;
  }
  mychomp(ts);

  /* remove byte order mark */
  if (ts.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
    ts.erase(0, 3);
  }

  tablesize = atoi(ts.c_str());

  int nExtra = 5 + USERWORD;

  if (tablesize <= 0 ||
      (tablesize >= (std::numeric_limits<int>::max() - 1 - nExtra) /
                        int(sizeof(struct hentry*)))) {
    HUNSPELL_WARNING(
        stderr, "error: line 1: missing or bad word count in the dic file\n");
    delete dict;
    return 4;
  }
  tablesize += nExtra;
  if ((tablesize % 2) == 0)
    tablesize++;

  // allocate the hash table
  tableptr = (struct hentry**)calloc(tablesize, sizeof(struct hentry*));
  if (!tableptr) {
    delete dict;
    return 3;
  }

  // loop through all words on much list and add to hash
  // table and create word and affix strings

  std::vector<w_char> workbuf;

  while (dict->getline(ts)) {
    mychomp(ts);
    // split each line into word and morphological description
    size_t dp_pos = 0;
    while ((dp_pos = ts.find(':', dp_pos)) != std::string::npos) {
      if ((dp_pos > 3) && (ts[dp_pos - 3] == ' ' || ts[dp_pos - 3] == '\t')) {
        for (dp_pos -= 3; dp_pos > 0 && (ts[dp_pos-1] == ' ' || ts[dp_pos-1] == '\t'); --dp_pos)
          ;
        if (dp_pos == 0) {  // missing word
          dp_pos = std::string::npos;
        } else {
          ++dp_pos;
        }
        break;
      }
      ++dp_pos;
    }

    // tabulator is the old morphological field separator
    size_t dp2_pos = ts.find('\t');
    if (dp2_pos != std::string::npos && (dp_pos == std::string::npos || dp2_pos < dp_pos)) {
      dp_pos = dp2_pos + 1;
    }

    std::string dp;
    if (dp_pos != std::string::npos) {
      dp.assign(ts.substr(dp_pos));
      ts.resize(dp_pos - 1);
    }

    // split each line into word and affix char strings
    // "\/" signs slash in words (not affix separator)
    // "/" at beginning of the line is word character (not affix separator)
    size_t ap_pos = ts.find('/');
    while (ap_pos != std::string::npos) {
      if (ap_pos == 0) {
        ++ap_pos;
        continue;
      } else if (ts[ap_pos - 1] != '\\')
        break;
      // replace "\/" with "/"
      ts.erase(ap_pos - 1, 1);
      ap_pos = ts.find('/', ap_pos);
    }

    unsigned short* flags;
    int al;
    if (ap_pos != std::string::npos && ap_pos != ts.size()) {
      std::string ap(ts.substr(ap_pos + 1));
      ts.resize(ap_pos);
      if (aliasf) {
        int index = atoi(ap.c_str());
        al = get_aliasf(index, &flags, dict);
        if (!al) {
          HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n",
                           dict->getlinenum());
        }
      } else {
        al = decode_flags(&flags, ap.c_str(), dict, /* arena = */ true);
        if (al == -1) {
          HUNSPELL_WARNING(stderr, "Can't allocate memory.\n");
          delete dict;
          return 6;
        }
        std::sort(flags, flags + al);
      }
    } else {
      al = 0;
      flags = NULL;
    }

    int captype;
    int wcl = get_clen_and_captype(ts, &captype, workbuf);
    const std::string *dp_str = dp.empty() ? NULL : &dp;
    // add the word and its index plus its capitalized form optionally
    if (add_word(ts, wcl, flags, al, dp_str, false, captype) ||
        add_hidden_capitalized_word(ts, wcl, flags, al, dp_str, captype)) {
      delete dict;
      return 5;
    }
  }

  delete dict;
  return 0;
}

// the hash function is a simple load and rotate
// algorithm borrowed
int HashMgr::hash(const char* word) const {
  unsigned long hv = 0;
  for (int i = 0; i < 4 && *word != 0; i++)
    hv = (hv << 8) | (*word++);
  while (*word != 0) {
    ROTATE(hv, ROTATE_LEN);
    hv ^= (*word++);
  }
  return (unsigned long)hv % tablesize;
}

int HashMgr::decode_flags(unsigned short** result, const std::string& flags, FileMgr* af, bool arena) const {
  auto alloc = [arena, this](int n) { return arena ? this->arena_alloc(n) : malloc(n); };
  int len;
  if (flags.empty()) {
    *result = NULL;
    return 0;
  }
  switch (flag_mode) {
    case FLAG_LONG: {  // two-character flags (1x2yZz -> 1x 2y Zz)
      len = flags.size();
      if (len % 2 == 1)
        HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n",
                         af->getlinenum());
      len /= 2;
      *result = (unsigned short*)alloc(len * sizeof(unsigned short));
      if (!*result)
        return -1;
      for (int i = 0; i < len; i++) {
        (*result)[i] = ((unsigned short)((unsigned char)flags[i * 2]) << 8) +
                       (unsigned char)flags[i * 2 + 1];
      }
      break;
    }
    case FLAG_NUM: {  // decimal numbers separated by comma (4521,23,233 -> 4521
                      // 23 233)
      len = 1;
      unsigned short* dest;
      for (size_t i = 0; i < flags.size(); ++i) {
        if (flags[i] == ',')
          len++;
      }
      *result = (unsigned short*)alloc(len * sizeof(unsigned short));
      if (!*result)
        return -1;
      dest = *result;
      const char* src = flags.c_str();
      for (const char* p = src; *p; p++) {
        if (*p == ',') {
          int i = atoi(src);
          if (i >= DEFAULTFLAGS)
            HUNSPELL_WARNING(
                stderr, "error: line %d: flag id %d is too large (max: %d)\n",
                af->getlinenum(), i, DEFAULTFLAGS - 1);
          *dest = (unsigned short)i;
          if (*dest == 0)
            HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
                             af->getlinenum());
          src = p + 1;
          dest++;
        }
      }
      int i = atoi(src);
      if (i >= DEFAULTFLAGS)
        HUNSPELL_WARNING(stderr,
                         "error: line %d: flag id %d is too large (max: %d)\n",
                         af->getlinenum(), i, DEFAULTFLAGS - 1);
      *dest = (unsigned short)i;
      if (*dest == 0)
        HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
                         af->getlinenum());
      break;
    }
    case FLAG_UNI: {  // UTF-8 characters
      std::vector<w_char> w;
      u8_u16(w, flags);
      len = w.size();
      *result = (unsigned short*)alloc(len * sizeof(unsigned short));
      if (!*result)
        return -1;
      memcpy(*result, w.data(), len * sizeof(short));
      break;
    }
    default: {  // Ispell's one-character flags (erfg -> e r f g)
      unsigned short* dest;
      len = flags.size();
      *result = (unsigned short*)alloc(len * sizeof(unsigned short));
      if (!*result)
        return -1;
      dest = *result;
      for (size_t i = 0; i < flags.size(); ++i) {
        *dest = (unsigned char)flags[i];
        dest++;
      }
    }
  }
  return len;
}

bool HashMgr::decode_flags(std::vector<unsigned short>& result, const std::string& flags, FileMgr* af) const {
  if (flags.empty()) {
    return false;
  }
  switch (flag_mode) {
    case FLAG_LONG: {  // two-character flags (1x2yZz -> 1x 2y Zz)
      size_t len = flags.size();
      if (len % 2 == 1)
        HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n",
                         af->getlinenum());
      len /= 2;
      result.reserve(result.size() + len);
      for (size_t i = 0; i < len; ++i) {
        result.push_back(((unsigned short)((unsigned char)flags[i * 2]) << 8) +
                         (unsigned char)flags[i * 2 + 1]);
      }
      break;
    }
    case FLAG_NUM: {  // decimal numbers separated by comma (4521,23,233 -> 4521
                      // 23 233)
      const char* src = flags.c_str();
      for (const char* p = src; *p; p++) {
        if (*p == ',') {
          int i = atoi(src);
          if (i >= DEFAULTFLAGS)
            HUNSPELL_WARNING(
                stderr, "error: line %d: flag id %d is too large (max: %d)\n",
                af->getlinenum(), i, DEFAULTFLAGS - 1);
          result.push_back((unsigned short)i);
          if (result.back() == 0)
            HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
                             af->getlinenum());
          src = p + 1;
        }
      }
      int i = atoi(src);
      if (i >= DEFAULTFLAGS)
        HUNSPELL_WARNING(stderr,
                         "error: line %d: flag id %d is too large (max: %d)\n",
                         af->getlinenum(), i, DEFAULTFLAGS - 1);
      result.push_back((unsigned short)i);
      if (result.back() == 0)
        HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
                         af->getlinenum());
      break;
    }
    case FLAG_UNI: {  // UTF-8 characters
      std::vector<w_char> w;
      u8_u16(w, flags);
      size_t len = w.size();
      size_t origsize = result.size();
      result.resize(origsize + len);
      memcpy(result.data() + origsize, w.data(), len * sizeof(short));
      break;
    }
    default: {  // Ispell's one-character flags (erfg -> e r f g)
      result.reserve(flags.size());
      for (size_t i = 0; i < flags.size(); ++i) {
        result.push_back((unsigned char)flags[i]);
      }
    }
  }
  return true;
}

unsigned short HashMgr::decode_flag(const char* f) const {
  unsigned short s = 0;
  int i;
  switch (flag_mode) {
    case FLAG_LONG:
      s = ((unsigned short)((unsigned char)f[0]) << 8) + (unsigned char)f[1];
      break;
    case FLAG_NUM:
      i = atoi(f);
      if (i >= DEFAULTFLAGS)
        HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n",
                         i, DEFAULTFLAGS - 1);
      s = (unsigned short)i;
      break;
    case FLAG_UNI: {
      std::vector<w_char> w;
      u8_u16(w, f);
      if (!w.empty())
          memcpy(&s, w.data(), 1 * sizeof(short));
      break;
    }
    default:
      s = *(unsigned char*)f;
  }
  if (s == 0)
    HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
  return s;
}

// This function is only called by external consumers, and so using the default
// allocator with mystrdup is correct.
char* HashMgr::encode_flag(unsigned short f) const {
  if (f == 0)
    return mystrdup("(NULL)");
  std::string ch;
  if (flag_mode == FLAG_LONG) {
    ch.push_back((unsigned char)(f >> 8));
    ch.push_back((unsigned char)(f - ((f >> 8) << 8)));
  } else if (flag_mode == FLAG_NUM) {
    std::ostringstream stream;
    stream << f;
    ch = stream.str();
  } else if (flag_mode == FLAG_UNI) {
    const w_char* w_c = (const w_char*)&f;
    std::vector<w_char> w(w_c, w_c + 1);
    u16_u8(ch, w);
  } else {
    ch.push_back((unsigned char)(f));
  }
  return mystrdup(ch.c_str());
}

// read in aff file and set flag mode
int HashMgr::load_config(const char* affpath, const char* key) {
  int firstline = 1;

  // open the affix file
  FileMgr* afflst = new FileMgr(affpath, key);
  if (!afflst) {
    HUNSPELL_WARNING(
        stderr, "Error - could not open affix description file %s\n", affpath);
    return 1;
  }

  // read in each line ignoring any that do not
  // start with a known line type indicator

  std::string line;
  while (afflst->getline(line)) {
    mychomp(line);

    /* remove byte order mark */
    if (firstline) {
      firstline = 0;
      if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
        line.erase(0, 3);
      }
    }

    /* parse in the try string */
    if ((line.compare(0, 4, "FLAG", 4) == 0) && line.size() > 4 && isspace(line[4])) {
      if (flag_mode != FLAG_CHAR) {
        HUNSPELL_WARNING(stderr,
                         "error: line %d: multiple definitions of the FLAG "
                         "affix file parameter\n",
                         afflst->getlinenum());
      }
      if (line.find("long") != std::string::npos)
        flag_mode = FLAG_LONG;
      if (line.find("num") != std::string::npos)
        flag_mode = FLAG_NUM;
      if (line.find("UTF-8") != std::string::npos)
        flag_mode = FLAG_UNI;
      if (flag_mode == FLAG_CHAR) {
        HUNSPELL_WARNING(
            stderr,
            "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n",
            afflst->getlinenum());
      }
    }

    if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) {
      std::string st;
      if (!parse_string(line, st, afflst->getlinenum())) {
        delete afflst;
        return 1;
      }
      forbiddenword = decode_flag(st.c_str());
    }

    if (line.compare(0, 3, "SET", 3) == 0) {
      if (!parse_string(line, enc, afflst->getlinenum())) {
        delete afflst;
        return 1;
      }
      if (enc == "UTF-8") {
        utf8 = 1;
#ifndef OPENOFFICEORG
#ifndef MOZILLA_CLIENT
        initialize_utf_tbl();
#endif
#endif
      } else
        csconv = get_current_cs(enc);
    }

    if (line.compare(0, 4, "LANG", 4) == 0) {
      if (!parse_string(line, lang, afflst->getlinenum())) {
        delete afflst;
        return 1;
      }
      langnum = get_lang_num(lang);
    }

    /* parse in the ignored characters (for example, Arabic optional diacritics
     * characters */
    if (line.compare(0, 6, "IGNORE", 6) == 0) {
      if (!parse_array(line, ignorechars, ignorechars_utf16,
                       utf8, afflst->getlinenum())) {
        delete afflst;
        return 1;
      }
    }

    if ((line.compare(0, 2, "AF", 2) == 0) && line.size() > 2 && isspace(line[2])) {
      if (!parse_aliasf(line, afflst)) {
        delete afflst;
        return 1;
      }
    }

    if ((line.compare(0, 2, "AM", 2) == 0) && line.size() > 2 && isspace(line[2])) {
      if (!parse_aliasm(line, afflst)) {
        delete afflst;
        return 1;
      }
    }

    if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
      complexprefixes = 1;

    /* parse in the typical fault correcting table */
    if (line.compare(0, 3, "REP", 3) == 0) {
      if (!parse_reptable(line, afflst)) {
        delete afflst;
        return 1;
      }
    }

    // don't check the full affix file, yet
    if (((line.compare(0, 3, "SFX", 3) == 0) ||
         (line.compare(0, 3, "PFX", 3) == 0)) &&
            line.size() > 3 && isspace(line[3]) &&
            !reptable.empty()) // (REP table is in the end of Afrikaans aff file)
      break;
  }

  if (csconv == NULL)
    csconv = get_current_cs(SPELL_ENCODING);
  delete afflst;
  return 0;
}

/* parse in the ALIAS table */
bool HashMgr::parse_aliasf(const std::string& line, FileMgr* af) {
  if (numaliasf != 0) {
    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
                     af->getlinenum());
    return false;
  }
  int i = 0;
  int np = 0;
  std::string::const_iterator iter = line.begin();
  std::string::const_iterator start_piece = mystrsep(line, iter);
  while (start_piece != line.end()) {
    switch (i) {
      case 0: {
        np++;
        break;
      }
      case 1: {
        numaliasf = atoi(std::string(start_piece, iter).c_str());
        if (numaliasf < 1) {
          numaliasf = 0;
          aliasf = NULL;
          aliasflen = NULL;
          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
                           af->getlinenum());
          return false;
        }
        aliasf =
            (unsigned short**)arena_alloc(numaliasf * sizeof(unsigned short*));
        aliasflen =
            (unsigned short*)arena_alloc(numaliasf * sizeof(unsigned short));
        if (!aliasf || !aliasflen) {
          numaliasf = 0;
          if (aliasf)
            arena_free(aliasf);
          if (aliasflen)
            arena_free(aliasflen);
          aliasf = NULL;
          aliasflen = NULL;
          return false;
        }
        np++;
        break;
      }
      default:
        break;
    }
    ++i;
    start_piece = mystrsep(line, iter);
  }
  if (np != 2) {
    numaliasf = 0;
    arena_free(aliasf);
    arena_free(aliasflen);
    aliasf = NULL;
    aliasflen = NULL;
    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
                     af->getlinenum());
    return false;
  }

  /* now parse the numaliasf lines to read in the remainder of the table */
  for (int j = 0; j < numaliasf; j++) {
    std::string nl;
    aliasf[j] = NULL;
    aliasflen[j] = 0;
    i = 0;
    if (af->getline(nl)) {
      mychomp(nl);
      iter = nl.begin();
      start_piece = mystrsep(nl, iter);
      bool errored = false;
      while (!errored && start_piece != nl.end()) {
        switch (i) {
          case 0: {
            if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) {
              errored = true;
              break;
            }
            break;
          }
          case 1: {
            std::string piece(start_piece, iter);
            aliasflen[j] =
                (unsigned short)decode_flags(&(aliasf[j]), piece, af, /* arena = */ true);
            std::sort(aliasf[j], aliasf[j] + aliasflen[j]);
            break;
          }
          default:
            break;
        }
        ++i;
        start_piece = mystrsep(nl, iter);
      }
    }
    if (!aliasf[j]) {
      for (int k = 0; k < j; ++k) {
        arena_free(aliasf[k]);
      }
      arena_free(aliasf);
      arena_free(aliasflen);
      aliasf = NULL;
      aliasflen = NULL;
      numaliasf = 0;
      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
                       af->getlinenum());
      return false;
    }
  }
  return true;
}

int HashMgr::is_aliasf() const {
  return (aliasf != NULL);
}

int HashMgr::get_aliasf(int index, unsigned short** fvec, FileMgr* af) const {
  if ((index > 0) && (index <= numaliasf)) {
    *fvec = aliasf[index - 1];
    return aliasflen[index - 1];
  }
  HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n",
                   af->getlinenum(), index);
  *fvec = NULL;
  return 0;
}

/* parse morph alias definitions */
bool HashMgr::parse_aliasm(const std::string& line, FileMgr* af) {
  if (numaliasm != 0) {
    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
                     af->getlinenum());
    return false;
  }
  int i = 0;
  int np = 0;
  std::string::const_iterator iter = line.begin();
  std::string::const_iterator start_piece = mystrsep(line, iter);
  while (start_piece != line.end()) {
    switch (i) {
      case 0: {
        np++;
        break;
      }
      case 1: {
        numaliasm = atoi(std::string(start_piece, iter).c_str());
        if (numaliasm < 1) {
          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
                           af->getlinenum());
          return false;
        }
        aliasm = (char**)arena_alloc(numaliasm * sizeof(char*));
        if (!aliasm) {
          numaliasm = 0;
          return false;
        }
        np++;
        break;
      }
      default:
        break;
    }
    ++i;
    start_piece = mystrsep(line, iter);
  }
  if (np != 2) {
    numaliasm = 0;
    arena_free(aliasm);
    aliasm = NULL;
    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
                     af->getlinenum());
    return false;
  }

  /* now parse the numaliasm lines to read in the remainder of the table */
  for (int j = 0; j < numaliasm; j++) {
    std::string nl;
    aliasm[j] = NULL;
    if (af->getline(nl)) {
      mychomp(nl);
      iter = nl.begin();
      i = 0;
      start_piece = mystrsep(nl, iter);
      bool errored = false;
      while (!errored && start_piece != nl.end()) {
        switch (i) {
          case 0: {
            if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) {
              errored = true;
              break;
            }
            break;
          }
          case 1: {
            // add the remaining of the line
            std::string::const_iterator end = nl.end();
            std::string chunk(start_piece, end);
            if (complexprefixes) {
              if (utf8)
                reverseword_utf(chunk);
              else
                reverseword(chunk);
            }
            size_t sl = chunk.length() + 1;
            aliasm[j] = (char*)arena_alloc(sl);
            if (aliasm[j]) {
              memcpy(aliasm[j], chunk.c_str(), sl);
            }
            break;
          }
          default:
            break;
        }
        ++i;
        start_piece = mystrsep(nl, iter);
      }
    }
    if (!aliasm[j]) {
      numaliasm = 0;
      for (int k = 0; k < j; ++k) {
        arena_free(aliasm[k]);
      }
      arena_free(aliasm);
      aliasm = NULL;
      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
                       af->getlinenum());
      return false;
    }
  }
  return true;
}

int HashMgr::is_aliasm() const {
  return (aliasm != NULL);
}

char* HashMgr::get_aliasm(int index) const {
  if ((index > 0) && (index <= numaliasm))
    return aliasm[index - 1];
  HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
  return NULL;
}

/* parse in the typical fault correcting table */
bool HashMgr::parse_reptable(const std::string& line, FileMgr* af) {
  if (!reptable.empty()) {
    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
                     af->getlinenum());
    return false;
  }
  int numrep = -1;
  int i = 0;
  int np = 0;
  std::string::const_iterator iter = line.begin();
  std::string::const_iterator start_piece = mystrsep(line, iter);
  while (start_piece != line.end()) {
    switch (i) {
      case 0: {
        np++;
        break;
      }
      case 1: {
        numrep = atoi(std::string(start_piece, iter).c_str());
        if (numrep < 1) {
          HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
                           af->getlinenum());
          return false;
        }
        reptable.reserve(numrep);
        np++;
        break;
      }
      default:
        break;
    }
    ++i;
    start_piece = mystrsep(line, iter);
  }
  if (np != 2) {
    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
                     af->getlinenum());
    return false;
  }

  /* now parse the numrep lines to read in the remainder of the table */
  for (int j = 0; j < numrep; ++j) {
    std::string nl;
    reptable.push_back(replentry());
    int type = 0;
    if (af->getline(nl)) {
      mychomp(nl);
      iter = nl.begin();
      i = 0;
      start_piece = mystrsep(nl, iter);
      bool errored = false;
      while (!errored && start_piece != nl.end()) {
        switch (i) {
          case 0: {
            if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0) {
              errored = true;
              break;
            }
            break;
          }
          case 1: {
            if (*start_piece == '^')
              type = 1;
            reptable.back().pattern.assign(start_piece + type, iter);
            mystrrep(reptable.back().pattern, "_", " ");
            if (!reptable.back().pattern.empty() && reptable.back().pattern[reptable.back().pattern.size() - 1] == '$') {
              type += 2;
              reptable.back().pattern.resize(reptable.back().pattern.size() - 1);
            }
            break;
          }
          case 2: {
            reptable.back().outstrings[type].assign(start_piece, iter);
            mystrrep(reptable.back().outstrings[type], "_", " ");
            break;
          }
          default:
            break;
        }
        ++i;
        start_piece = mystrsep(nl, iter);
      }
    }
    if (reptable.back().pattern.empty() || reptable.back().outstrings[type].empty()) {
      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
                       af->getlinenum());
      reptable.clear();
      return false;
    }
  }
  return true;
}

// return replacing table
const std::vector<replentry>& HashMgr::get_reptable() const {
  return reptable;
}

void* HashMgr::arena_alloc(int num_bytes) {
  static const int MIN_CHUNK_SIZE = 4096;
  if (arena.empty() || (current_chunk_size - current_chunk_offset < num_bytes)) {
    current_chunk_size = std::max(MIN_CHUNK_SIZE, num_bytes);
    arena.push_back(std::make_unique<uint8_t[]>(current_chunk_size));
    current_chunk_offset = 0;
  }

  uint8_t* ptr = &arena.back()[current_chunk_offset];
  current_chunk_offset += num_bytes;
  outstanding_arena_allocations++;
  return ptr;
}

void HashMgr::arena_free(void* ptr) {
  --outstanding_arena_allocations;
  assert(outstanding_arena_allocations >= 0);
}

Messung V0.5

¤ Dauer der Verarbeitung: 0.50 Sekunden (vorverarbeitet) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.