Fuzion • APIs • Standard APIs • codepoint.fz
codepoint.fz


# This file is part of the Fuzion language implementation.
#
# The Fuzion language implementation is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 3 of the License.
#
# The Fuzion language implementation is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
# License for more details.
#
# You should have received a copy of the GNU General Public License along with The
# Fuzion language implementation.  If not, see <https://www.gnu.org/licenses/>.


# -----------------------------------------------------------------------
#
#  Tokiwa Software GmbH, Germany
#
#  Source code of Fuzion standard library feature codepoint
#
#  Author: Fridtjof Siebert (siebert@tokiwa.software)
#
# -----------------------------------------------------------------------

# codepoint -- represents a unicode codepoint
#
# A codepoint wraps a `u32` value and is itself a `String` consisting of
# exactly that one codepoint.
#
# With debug checks enabled, `val` must lie within `codepoint.range` and must
# not be part of the range reserved for utf16 surrogates.
#
public codepoint(public val u32) : String
  pre
    debug: (codepoint.range.contains val)
    debug: !codepoint.utf16_surrogate.contains val
is

  # the utf8 encoded bytes for the string representation
  # of this codepoint
  #
  # Depending on the range `val` falls into, the result consists of one to
  # four bytes.  The first byte carries a marker for the total length
  # (0xc0, 0xe0 or 0xf0 for two, three or four bytes) combined with the
  # topmost payload bits, every following byte carries the marker 0x80
  # combined with the next six payload bits.
  #
  # If `val` is in none of the encodable ranges -- which should only be
  # possible if the preconditions of `codepoint` were not checked -- this
  # panics.
  #
  public redef utf8 Sequence u8 =>
    if      codepoint.utf8_encoded_in_one_byte   .contains val then [ val.low8bits ]
    else if codepoint.utf8_encoded_in_two_bytes  .contains val then [ (u32 0xc0 | (val >>  6) & 0x1f).low8bits,
                                                                           (u32 0x80 |  val        & 0x3f).low8bits ]
    # the lead byte of a three byte sequence holds only four payload bits, hence the mask 0x0f
    else if codepoint.utf8_encoded_in_three_bytes.contains val then [ (u32 0xe0 | (val >> 12) & 0x0f).low8bits,
                                                                           (u32 0x80 | (val >>  6) & 0x3f).low8bits,
                                                                           (u32 0x80 |  val        & 0x3f).low8bits ]
    else if codepoint.utf8_encoded_in_four_bytes .contains val then [ (u32 0xf0 | (val >> 18) & 0x07).low8bits,
                                                                           (u32 0x80 | (val >> 12) & 0x3f).low8bits,
                                                                           (u32 0x80 | (val >>  6) & 0x3f).low8bits,
                                                                           (u32 0x80 |  val        & 0x3f).low8bits ]
    else
      # report the numeric value instead of `codepoint.this`: turning this
      # codepoint into text would require the very utf8 encoding that just
      # failed and would end up in this branch again.
      fuzion.std.panic "failed to encode code point {val}"


  # is this an ASCII code encoded in one byte
  #
  public redef is_ascii => codepoint.ascii.contains val


  # range of permitted values for a codepoint
  #
  public type.range => u32 0 .. 0x10ffff


  # range of ASCII values, these are the values encoded in one byte
  #
  public type.ascii => u32 0 .. 0x7f

  # 0 to 9
  public type.ascii_digit => u32 0x30 .. 0x39

  # A to Z (uppercase only)
  public type.A_to_Z => u32 0x41 .. 0x5A

  # a to z (lowercase only)
  public type.a_to_z => u32 0x61 .. 0x7A

  # a-z and A-Z
  # https://en.wikipedia.org/wiki/ISO_basic_Latin_alphabet
  #
  # The result lists the uppercase letters first, followed by the lowercase
  # letters.
  public type.latin_alphabet => codepoint.A_to_Z.concat_sequences a_to_z # NYI It would be better to use union of those two sets

  # range of values encoded in one byte, this is the same as `ascii`
  #
  public type.utf8_encoded_in_one_byte => ascii


  # range of values encoded in two bytes
  #
  public type.utf8_encoded_in_two_bytes => u32 0x80 .. 0x7ff


  # range of values encoded in three bytes
  #
  # NOTE: this range includes the values of `utf16_surrogate`, which are
  # excluded by the precondition of `codepoint`.
  #
  public type.utf8_encoded_in_three_bytes => u32 0x800 .. 0xffff


  # range of values encoded in four bytes
  #
  public type.utf8_encoded_in_four_bytes => u32 0x10000 .. 0x10ffff


  # range reserved for utf16 surrogate pairs
  #
  public type.utf16_surrogate => u32 0xd800 .. 0xdfff


  # values guaranteed never to be a legal unicode character
  #
  # NOTE: this covers only U+FFFE and U+FFFF.  Unicode defines further
  # noncharacters (e.g., U+FDD0..U+FDEF and the last two values of every
  # other plane) that are not part of this range.
  #
  public type.not_a_character => u32 0xfffe .. 0xffff


  # return the number of bytes of this utf-8 character
  # by examining the first byte
  #
  # The result is an error if `first_byte` cannot start a utf-8 character:
  # this is the case for continuation bytes (0x80..0xBF), for 0xC0 and 0xC1,
  # which could only start overlong encodings, and for 0xF5..0xFF, which
  # would encode values above 0x10FFFF.
  #
  # NYI: this could be derived from the number of leading one bits of
  # first_byte (none: 1 byte, two: 2 bytes, three: 3 bytes, four: 4 bytes),
  # but the invalid start bytes listed above would still have to be rejected.
  module type.num_utf8_bytes (first_byte u8) outcome i32 =>
    if ((u8 0)..0x7F).contains first_byte
      1
    else if ((u8 0xC2)..0xDF).contains first_byte
      2
    else if ((u8 0xE0)..0xEF).contains first_byte
      3
    else if ((u8 0xF0)..0xF4).contains first_byte
      4
    else
      error "first byte is not the start of utf-8 character."


  # compare two codepoints for equality
  #
  # result is true iff the codepoints have the same value
  #
  fixed type.equality(a, b codepoint) => a.val = b.val


  # compare two codepoints
  #
  # This defines a total order over codepoints by their numeric value, which
  # is unrelated to alphabetic order.
  #
  fixed type.lteq(a, b codepoint) => a.val <= b.val
next: composition.fz