Quelle unicode.ml Sprache: SML

(************************************************************************)
(*         *      The Rocq Prover / The Rocq Development Team           *)
(*  v      *         Copyright INRIA, CNRS and contributors             *)
(* <O___,, * (see version control and CREDITS file for authors & dates) *)
(*   \VV/  **************************************************************)
(*    //   *    This file is distributed under the terms of the         *)
(*         *     GNU Lesser General Public License Version 2.1          *)
(*         *     (see LICENSE file for the text of the license)         *)
(************************************************************************)

(** Unicode utilities *)

type status = Letter | IdentPart | Symbol | IdentSep | Separator | Control | Unknown

(* The following table stores classes of Unicode characters that
   are used by the lexer. There are 5 different classes so 3 bits
   are allocated for each character. We encode the masks of 8
   characters per word, thus using 24 bits over the 31 available
   bits. (This choice seems to be a good trade-off between speed
   and space after some benchmarks.) *)

(* A 256 KiB table, initially filled with zeros. *)
let table = Array.make (1 lsl 17) 0

(* Associate a 2-bit pattern to each status at position [i].
   Only the 3 lowest bits of [i] are taken into account to
   define the position of the pattern in the word.
   Notice that pattern "00" means "undefined". *)
let mask i = function
  | Letter    -> 1 lsl ((i land 7) * 3) (* 001 *)
  | IdentPart -> 2 lsl ((i land 7) * 3) (* 010 *)
  | Symbol    -> 3 lsl ((i land 7) * 3) (* 011 *)
  | IdentSep  -> 4 lsl ((i land 7) * 3) (* 100 *)
  | Separator -> 5 lsl ((i land 7) * 3) (* 101 *)
  | Control   -> 6 lsl ((i land 7) * 3) (* 110 *)
  | Unknown   -> 0 lsl ((i land 7) * 3) (* 000 *)

(* Helper to reset 3 bits in a word. *)
let reset_mask i =
  lnot (7 lsl ((i land 7) * 3))

(* Initialize the lookup table from a list of segments, assigning
   a status to every character of each segment. The order of these
   assignments is relevant: it is possible to assign status [s] to
   a segment [(c1, c2)] and later assign [s'] to [c] even if [c] is
   between [c1] and [c2]. *)
let mk_lookup_table_from_unicode_tables_for status tables =
  List.iter
    (List.iter
       (fun (c1, c2) ->
          for i = c1 to c2 do
            table.(i lsr 3) <-
              (table.(i lsr 3) land (reset_mask i)) lor (mask i status)
          done))
    tables

(* Look up into the table and interpret the found pattern. *)
let lookup x =
  let v = (table.(x lsr 3) lsr ((x land 7) * 3)) land 7 in
    if      v = 1 then Letter
    else if v = 2 then IdentPart
    else if v = 3 then Symbol
    else if v = 4 then IdentSep
    else if v = 5 then Separator
    else if v = 6 then Control
    else Unknown

(* [classify] discriminates between 5 different kinds of
   symbols based on the standard unicode classification (extracted from
   Camomile). *)
let classify =
  let single c = [ (c, c) ] in
    (* General tables. *)
    mk_lookup_table_from_unicode_tables_for Symbol
      [
        Unicodetable.sk;           (* Symbol, modifiers.                *)
        Unicodetable.sm;           (* Symbol, maths.                    *)
        Unicodetable.sc;           (* Symbol, currency.                 *)
        Unicodetable.so;           (* Symbol, modifier.                 *)
        Unicodetable.pd;           (* Punctuation, dash.                *)
        Unicodetable.pc;           (* Punctuation, connector.           *)
        Unicodetable.pe;           (* Punctuation, open.                *)
        Unicodetable.ps;           (* Punctution, close.                *)
        Unicodetable.pi;           (* Punctuation, initial quote.       *)
        Unicodetable.pf;           (* Punctuation, final quote.         *)
        Unicodetable.po;           (* Punctuation, other.               *)
      ];
    mk_lookup_table_from_unicode_tables_for Letter
      [
        Unicodetable.lu;           (* Letter, uppercase.                *)
        Unicodetable.ll;           (* Letter, lowercase.                *)
        Unicodetable.lt;           (* Letter, titlecase.                *)
        Unicodetable.lo;           (* Letter, others.                   *)
        Unicodetable.lm;           (* Letter, modifier.                 *)
      ];
    mk_lookup_table_from_unicode_tables_for IdentPart
      [
        Unicodetable.nd;           (* Number, decimal digits.           *)
        Unicodetable.nl;           (* Number, letter.                   *)
        Unicodetable.no;           (* Number, other.                    *)
        Unicodetable.mn;           (* Non-spacing marks.                *)
      ];

    (* Exceptions from Number, other.                                   *)
    mk_lookup_table_from_unicode_tables_for Symbol
      [
        [(0x000B2, 0x000B3)];      (* Superscript 2-3.                  *)
        single 0x000B9;            (* Superscript 1.                    *)
        single 0x02070;            (* Superscript 0.                    *)
        [(0x02074, 0x02079)];      (* Superscript 4-9.                  *)
      ];
    mk_lookup_table_from_unicode_tables_for Separator
      [
        Unicodetable.zs;           (* Separator, Space.                 *)
        Unicodetable.zl;           (* Separator, Line.                  *)
        Unicodetable.zp;           (* Separator, Paragraph.             *)
      ];
    mk_lookup_table_from_unicode_tables_for Control
      [
        Unicodetable.cc;           (* Other, Control.                   *)
      ];
    mk_lookup_table_from_unicode_tables_for IdentSep
      [
        single 0x005F;             (* Underscore.                       *)
        single 0x00A0;             (* Non breaking space, overrides Sep *)
      ];
    mk_lookup_table_from_unicode_tables_for IdentPart
      [
        single 0x0027;             (* Single quote.                     *)
      ];
    (* Lookup *)
    lookup

exception End_of_input

let utf8_of_unicode n =
  if n < 128 then
    String.make 1 (Char.chr n)
  else
    let (m,s) = if n < 2048 then (2,192) else if n < 65536 then (3,224) else (4,240) in
    String.init m (fun i ->
        let j = (n lsr ((m - 1 - i) * 6)) land 63 in
        Char.chr (j + if i = 0 then s else 128))

(* If [s] is some UTF-8 encoded string
   and [i] is a position of some UTF-8 character within [s]
   then [next_utf8 s i] returns [(j,n)] where:
   - [j] indicates the position of the next UTF-8 character
   - [n] represents the UTF-8 character at index [i] *)
let next_utf8 s i =
  let err () = invalid_arg "utf8" in
  let l = String.length s - i in
  if l = 0 then raise End_of_input
  else let a = Char.code s.[i] in if a <= 0x7F then
    1, a
  else if a land 0x40 = 0 || l = 1 then err ()
  else let b = Char.code s.[i+1] in if b land 0xC0 <> 0x80 then err ()
  else if a land 0x20 = 0 then
    2, (a land 0x1F) lsl 6 + (b land 0x3F)
  else if l = 2 then err ()
  else let c = Char.code s.[i+2] in if c land 0xC0 <> 0x80 then err ()
  else if a land 0x10 = 0 then
    3, (a land 0x0F) lsl 12 + (b land 0x3F) lsl 6 + (c land 0x3F)
  else if l = 3 then err ()
  else let d = Char.code s.[i+3] in if d land 0xC0 <> 0x80 then err ()
  else if a land 0x08 = 0 then
    4, (a land 0x07) lsl 18 + (b land 0x3F) lsl 12 +
       (c land 0x3F) lsl 6 + (d land 0x3F)
  else err ()

let is_utf8 s =
  let rec check i =
    let (off, _) = next_utf8 s i in
    check (i + off)
  in
  try check 0 with End_of_input -> true | Invalid_argument _ -> false

(* Escape string if it contains non-utf8 characters *)

let escaped_non_utf8 s =
  let mk_escape x = Printf.sprintf "%%%X" x in
  let buff = Buffer.create (String.length s * 3) in
  let rec process_trailing_aux i j =
    if i = j then i else
      match String.unsafe_get s i with
      | '\128'..'\191' -> process_trailing_aux (i+1) j
      | _ -> i in
  let process_trailing i n =
    let j = if i+n-1 >= String.length s then i+1 else process_trailing_aux (i+1) (i+n) in
    (if j = i+n then
      Buffer.add_string buff (String.sub s i n)
    else
      let v = Array.init (j-i) (fun k -> mk_escape (Char.code s.[i+k])) in
      Buffer.add_string buff (String.concat "" (Array.to_list v)));
    j in
  let rec process i =
    if i >= String.length s then Buffer.contents buff else
      let c = String.unsafe_get s i in
      match c with
      | '\000'..'\127' -> Buffer.add_char buff c; process (i+1)
      | '\128'..'\191' | '\248'..'\255' -> Buffer.add_string buff (mk_escape (Char.code c)); process (i+1)
      | '\192'..'\223' -> process (process_trailing i 2)
      | '\224'..'\239' -> process (process_trailing i 3)
      | '\240'..'\247' -> process (process_trailing i 4)
  in
  process 0

let escaped_if_non_utf8 s =
  if is_utf8 s then s else escaped_non_utf8 s

(* Check the well-formedness of an identifier *)

let is_valid_ident_initial = function
  | Letter | IdentSep -> true
  | IdentPart | Symbol | Separator | Control | Unknown -> false

let initial_refutation j n s =
  if is_valid_ident_initial (classify n) then None
  else
      let c = String.sub s 0 j in
      Some (false,
            "Invalid character '"^c^"' at beginning of identifier \""^s^"\".")

let is_valid_ident_trailing = function
  | Letter | IdentSep | IdentPart -> true
  | Symbol | Separator | Control | Unknown -> false

let trailing_refutation i j n s =
  if is_valid_ident_trailing (classify n) then None
  else
      let c = String.sub s i j in
      Some (false,
            "Invalid character '"^c^"' in identifier \""^s^"\".")

let is_unknown = function
  | Unknown -> true
  | Letter | IdentSep | IdentPart | Symbol | Separator | Control -> false

let is_ident_part = function
  | IdentPart -> true
  | Letter | IdentSep | Symbol | Unknown | Separator | Control -> false

let is_ident_sep = function
  | IdentSep -> true
  | Letter | IdentPart | Symbol | Unknown | Separator | Control -> false

let is_letter = function
  | Letter -> true
  | IdentSep | IdentPart | Symbol | Unknown | Separator | Control -> false

let ident_refutation s =
  if s = ".." then None else try
    let j, n = next_utf8 s 0 in
      match initial_refutation j n s with
        |None ->
           begin try
             let rec aux i =
               let j, n = next_utf8 s i in
                 match trailing_refutation i j n s with
                   |None -> aux (i + j)
                   |x -> x
             in aux j
           with End_of_input -> None
           end
        |x -> x
  with
  | End_of_input -> Some (true,"The empty string is not an identifier.")
  | Invalid_argument _ -> Some (true,escaped_non_utf8 s^": invalid utf8 sequence.")

let lowercase_unicode =
  let tree = Segmenttree.make Unicodetable.to_lower in
  fun unicode ->
    try
      match Segmenttree.lookup unicode tree with
        | `Abs c -> c
        | `Delta d -> unicode + d
    with Not_found -> unicode

let lowercase_first_char s =
  assert (s <> "");
  let j, n = next_utf8 s 0 in
  utf8_of_unicode (lowercase_unicode n)

let split_at_first_letter s =
  let n, v = next_utf8 s 0 in
  if ((* optim *) n = 1 && s.[0] != '_') || not (is_ident_sep (classify v)) then None
  else begin
    let n = ref n in
    let p = ref 0 in
    while !n < String.length s &&
          let n', v = next_utf8 s !n in
          p := n';
          (* Test if not letter *)
          ((* optim *) n' = 1 && (s.[!n] = '_' || s.[!n] = '\''))
          || let st = classify v in
             is_ident_sep st || is_ident_part st
    do n := !n + !p
    done;
    let s1 = String.sub s 0 !n in
    let s2 = String.sub s !n (String.length s - !n) in
    Some (s1,s2)
  end

(** For extraction, we need to encode unicode character into ascii ones *)

let is_basic_ascii s =
  let ok = ref true in
  String.iter (fun c -> if Char.code c >= 128 then ok := false) s;
  !ok

let ascii_of_ident s =
  let len = String.length s in
  let has_UU i =
    i+2 < len && s.[i]='_' && s.[i+1]='U' && s.[i+2]='U'
  in
  let i = ref 0 in
  while !i < len && Char.code s.[!i] < 128 && not (has_UU !i) do
    incr i
  done;
  if !i = len then s else
    let out = Buffer.create (2*len) in
    Buffer.add_substring out s 0 !i;
    while !i < len do
      let j, n = next_utf8 s !i in
      if n >= 128 then
        (Printf.bprintf out "_UU%04x_" n; i := !i + j)
      else if has_UU !i then
        (Buffer.add_string out "_UUU"; i := !i + 3)
      else
        (Buffer.add_char out s.[!i]; incr i)
    done;
    Buffer.contents out

(* Compute length of an UTF-8 encoded string
   Rem 1 : utf8_length <= String.length (equal if pure ascii)
   Rem 2 : if used for an iso8859_1 encoded string, the result is
   wrong in very rare cases. Such a wrong case corresponds to any
   sequence of a character in range 192..253 immediately followed by a
   character in range 128..191 (typical case in french is "déçu" which
   is counted 3 instead of 4); then no real harm to use always
   utf8_length even if using an iso8859_1 encoding *)

(** FIXME: duplicate code with Pp *)

let utf8_length s =
  let len = String.length s
  and cnt = ref 0
  and nc = ref 0
  and p = ref 0 in
  while !p < len do
    begin
      match s.[!p] with
      | '\000'..'\127' -> nc := 0 (* ascii char *)
      | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
      | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
      | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
      | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
      | '\248'..'\255' -> nc := 0 (* invalid byte *)
    end ;
    incr p ;
    while !p < len && !nc > 0 do
      match s.[!p] with
      | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
      | _ (* not a continuation byte *) -> nc := 0
    done ;
    incr cnt
  done ;
  !cnt

(* Variant of String.sub for UTF8 character positions *)
let utf8_sub s start_u len_u =
  let len_b = String.length s
  and end_u = start_u + len_u
  and cnt = ref 0
  and nc = ref 0
  and p = ref 0 in
  let start_b = ref len_b in
  while !p < len_b && !cnt < end_u do
    if !cnt <= start_u then start_b := !p ;
    begin
      match s.[!p] with
      | '\000'..'\127' -> nc := 0 (* ascii char *)
      | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
      | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
      | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
      | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
      | '\248'..'\255' -> nc := 0 (* invalid byte *)
    end ;
    incr p ;
    while !p < len_b && !nc > 0 do
      match s.[!p] with
      | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
      | _ (* not a continuation byte *) -> nc := 0
    done ;
    incr cnt
  done ;
  let end_b = !p in
  String.sub s !start_b (end_b - !start_b)

quality99%

¤ Dauer der Verarbeitung: 0.9 Sekunden ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung ist noch experimentell.