Improve script generating unaccent rules

author Teodor Sigaev

Wed, 16 Mar 2016 13:47:03 +0000 (16:47 +0300)

committer Teodor Sigaev

Wed, 16 Mar 2016 13:47:03 +0000 (16:47 +0300)
author Teodor Sigaev
Wed, 16 Mar 2016 13:47:03 +0000 (16:47 +0300)
committer Teodor Sigaev
Wed, 16 Mar 2016 13:47:03 +0000 (16:47 +0300)
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py

index b838d8f630d82b35178c651619946591f5ae6089..2f5520c81981597fcc70d7cc8d9b2dc013ab9310 100644 (file)
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -1,20 +1,33 @@
-#!/usr/bin/python
+#!/usr/bin/python2
+# -*- coding: utf-8 -*-
  #
  # This script builds unaccent.rules on standard output when given the
-# contents of UnicodeData.txt[1] on standard input.  Optionally includes
-# ligature expansion, if --expand-ligatures is given on the command line.
+# contents of UnicodeData.txt [1] and Latin-ASCII.xml [2] given as
+# arguments.  Ligature expansion and the Unicode CLDR Latin-ASCII
+# transliterator are enabled by default; both can be disabled with the
+# "--no-ligatures-expansion" command line option.
  #
  # The approach is to use the Unicode decomposition data to identify
  # precomposed codepoints that are equivalent to a ligature of several
  # letters, or a base letter with any number of diacritical marks.
-# There is also a small set of special cases for codepoints that we
-# traditionally support even though Unicode doesn't consider them to
-# be ligatures or letters with marks.
  #
-# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
+# This approach handles most letters with diacritical marks and some
+# ligatures.  However, several characters (notably a majority of
+# ligatures) have no decomposition.  To handle all these cases, one can
+# use a standard Unicode transliterator available in the Common Locale
+# Data Repository (CLDR): Latin-ASCII.  This transliterator maps Unicode
+# characters to their ASCII-range equivalents.  Unless the
+# "--no-ligatures-expansion" option is given, the XML file of this
+# transliterator [2] -- given as a command line argument -- is parsed and used.
+#
+# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
+# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
+
  
  import re
+import argparse
  import sys
+import xml.etree.ElementTree as ET
  
  def print_record(codepoint, letter):
      print (unichr(codepoint) + "\t" + letter).encode("UTF-8")
@@ -63,15 +76,73 @@ def get_plain_letters(codepoint, table):
      assert(is_ligature(codepoint, table))
      return [get_plain_letter(table[id], table) for id in codepoint.combining_ids]
  
-def main(expand_ligatures):
+def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
+    """Parse the XML file and return a set of tuples (src, trg), where "src"
+    is the original character and "trg" the substitute."""
+    charactersSet = set()
+
+    # RegEx to parse rules
+    rulePattern = re.compile(ur'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
+
+    # construct tree from XML
+    transliterationTree = ET.parse(latinAsciiFilePath)
+    transliterationTreeRoot = transliterationTree.getroot()
+
+    for rule in transliterationTreeRoot.findall("./transforms/transform/tRule"):
+        matches = rulePattern.search(rule.text)
+
+        # The regular expression captures four groups corresponding
+        # to the characters.
+        #
+        # Group 1: plain "src" char. None if group 2 matched.
+        # Group 2: unicode-escaped "src" char (e.g. "\u0110"). None if group 1 matched.
+        #
+        # Group 3: "trg" string between quotes. None if group 4 matched.
+        # Group 4: plain (unquoted) "trg" string. None if group 3 matched.
+        if matches is not None:
+            src = matches.group(1) if matches.group(1) is not None else matches.group(2).decode('unicode-escape')
+            trg = matches.group(3) if matches.group(3) is not None else matches.group(4)
+
+            # "'" and '"' are backslash-escaped in the rule text; unescape them
+            trg = trg.replace("\\'", "'").replace('\\"', '"')
+
+            # the parser of unaccent only accepts non-whitespace characters
+            # for "src" and "trg" (see unaccent.c)
+            if not src.isspace() and not trg.isspace():
+                charactersSet.add((ord(src), trg))
+
+    return charactersSet
+
+def special_cases():
+    """Returns the special cases which are not handled by other methods"""
+    charactersSet = set()
+
+    # Cyrillic
+    charactersSet.add((0x0401, u"\u0415")) # CYRILLIC CAPITAL LETTER IO
+    charactersSet.add((0x0451, u"\u0435")) # CYRILLIC SMALL LETTER IO
+
+    # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
+    charactersSet.add((0x2103, u"\xb0C")) # DEGREE CELSIUS
+    charactersSet.add((0x2109, u"\xb0F")) # DEGREE FAHRENHEIT
+    charactersSet.add((0x2117, "(P)")) # SOUND RECORDING COPYRIGHT
+
+    return charactersSet
+
+def main(args):
      # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
      decomposition_type_pattern = re.compile(" *<[^>]*> *")
  
      table = {}
      all = []
  
+    # unordered set to ensure uniqueness
+    charactersSet = set()
+
+    # read file UnicodeData.txt
+    unicodeDataFile = open(args.unicodeDataFilePath, 'r')
+
      # read everything we need into memory
-    for line in sys.stdin.readlines():
+    for line in unicodeDataFile:
          fields = line.split(";")
          if len(fields) > 5:
              # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
@@ -89,35 +160,34 @@ def main(expand_ligatures):
          if codepoint.general_category.startswith('L') and \
             len(codepoint.combining_ids) > 1:
              if is_letter_with_marks(codepoint, table):
-                print_record(codepoint.id,
-                             chr(get_plain_letter(codepoint, table).id))
-            elif expand_ligatures and is_ligature(codepoint, table):
-                print_record(codepoint.id,
+                charactersSet.add((codepoint.id,
+                             chr(get_plain_letter(codepoint, table).id)))
+            elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
+                charactersSet.add((codepoint.id,
                               "".join(unichr(combining_codepoint.id)
                                       for combining_codepoint \
-                                     in get_plain_letters(codepoint, table)))
-
-    # some special cases
-    print_record(0x00d8, "O") # LATIN CAPITAL LETTER O WITH STROKE
-    print_record(0x00f8, "o") # LATIN SMALL LETTER O WITH STROKE
-    print_record(0x0110, "D") # LATIN CAPITAL LETTER D WITH STROKE
-    print_record(0x0111, "d") # LATIN SMALL LETTER D WITH STROKE
-    print_record(0x0131, "i") # LATIN SMALL LETTER DOTLESS I
-    print_record(0x0126, "H") # LATIN CAPITAL LETTER H WITH STROKE
-    print_record(0x0127, "h") # LATIN SMALL LETTER H WITH STROKE
-    print_record(0x0141, "L") # LATIN CAPITAL LETTER L WITH STROKE
-    print_record(0x0142, "l") # LATIN SMALL LETTER L WITH STROKE
-    print_record(0x0149, "'n") # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
-    print_record(0x0166, "T") # LATIN CAPITAL LETTER T WITH STROKE
-    print_record(0x0167, "t") # LATIN SMALL LETTER t WITH STROKE
-    print_record(0x0401, u"\u0415") # CYRILLIC CAPITAL LETTER IO
-    print_record(0x0451, u"\u0435") # CYRILLIC SMALL LETTER IO
-    if expand_ligatures:
-        print_record(0x00c6, "AE") # LATIN CAPITAL LETTER AE
-        print_record(0x00df, "ss") # LATIN SMALL LETTER SHARP S
-        print_record(0x00e6, "ae") # LATIN SMALL LETTER AE
-        print_record(0x0152, "OE") # LATIN CAPITAL LIGATURE OE
-        print_record(0x0153, "oe") # LATIN SMALL LIGATURE OE
+                                     in get_plain_letters(codepoint, table))))
+
+    # add CLDR Latin-ASCII characters
+    if not args.noLigaturesExpansion:
+        charactersSet |= parse_cldr_latin_ascii_transliterator(args.latinAsciiFilePath)
+        charactersSet |= special_cases()
+
+    # sort for more convenient display
+    charactersList = sorted(charactersSet, key=lambda characterPair: characterPair[0])
+
+    for characterPair in charactersList:
+        print_record(characterPair[0], characterPair[1])
  
  if __name__ == "__main__":
-    main(len(sys.argv) == 2 and sys.argv[1] == "--expand-ligatures")
+    parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
+    parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt. See .", type=str, required=True, dest='unicodeDataFilePath')
+    parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml). See .", type=str, dest='latinAsciiFilePath')
+    parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion')
+    args = parser.parse_args()
+
+    if args.noLigaturesExpansion is False and args.latinAsciiFilePath is None:
+        sys.stderr.write('You must specify the path to Latin-ASCII transliterator file with \"--latin-ascii-file\" option or use \"--no-ligatures-expansion\" option. Use \"-h\" option for help.')
+        sys.exit(1)
+
+    main(args)
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules

index 73c24a188badf9dfcbf5ab3950841232014e3c22..84886da587aa8a0b4fd8e58fbc43a727f526b9b2 100644 (file)
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -1,9 +1,18 @@
+© (C)
+« <<
+ -
+® (R)
+» >>
+¼  1/4
+½  1/2
+¾  3/4
  À A
  Á A
  Â A
  Ã A
  Ä A
  Å A
+Æ AE
  Ç C
  È E
  É E
@@ -13,23 +22,29 @@
  Í I
  Î I
  Ï I
+Ð D
  Ñ N
  Ò O
  Ó O
  Ô O
  Õ O
  Ö O
+× *
+Ø O
  Ù U
  Ú U
  Û U
  Ü U
  Ý Y
+Þ TH
+ß ss
  à a
  á a
  â a
  ã a
  ä a
  å a
+æ ae
  ç c
  è e
  é e
@@ -39,17 +54,21 @@
  í i
  î i
  ï i
+ð d
  ñ n
  ò o
  ó o
  ô o
  õ o
  ö o
+÷ /
+ø o
  ù u
  ú u
  û u
  ü u
  ý y
+þ th
  ÿ y
  Ā A
  ā a
@@ -67,6 +86,8 @@
  č c
  Ď D
  ď d
+Đ D
+đ d
  Ē E
  ē e
  Ĕ E
@@ -87,6 +108,8 @@
  ģ g
  Ĥ H
  ĥ h
+Ħ H
+ħ h
  Ĩ I
  ĩ i
  Ī I
@@ -96,30 +119,41 @@
  Į I
  į i
  İ I
+ı i
  Ĳ IJ
  ĳ ij
  Ĵ J
  ĵ j
  Ķ K
  ķ k
+ĸ q
  Ĺ L
  ĺ l
  Ļ L
  ļ l
  Ľ L
  ľ l
+Ŀ L
+ŀ l
+Ł L
+ł l
  Ń N
  ń n
  Ņ N
  ņ n
  Ň N
  ň n
+ŉ 'n
+Ŋ N
+ŋ n
  Ō O
  ō o
  Ŏ O
  ŏ o
  Ő O
  ő o
+Œ OE
+œ oe
  Ŕ R
  ŕ r
  Ŗ R
@@ -138,6 +172,8 @@
  ţ t
  Ť T
  ť t
+Ŧ T
+ŧ t
  Ũ U
  ũ u
  Ū U
@@ -161,10 +197,46 @@
  ż z
  Ž Z
  ž z
+ſ s
+ƀ b
+Ɓ B
+Ƃ B
+ƃ b
+Ƈ C
+ƈ c
+Ɖ D
+Ɗ D
+Ƌ D
+ƌ d
+Ɛ E
+Ƒ F
+ƒ f
+Ɠ G
+ƕ hv
+Ɩ I
+Ɨ I
+Ƙ K
+ƙ k
+ƚ l
+Ɲ N
+ƞ n
  Ơ O
  ơ o
+Ƣ OI
+ƣ oi
+Ƥ P
+ƥ p
+ƫ t
+Ƭ T
+ƭ t
+Ʈ T
  Ư U
  ư u
+Ʋ V
+Ƴ Y
+ƴ y
+Ƶ Z
+ƶ z
  Ǆ DZ
  ǅ Dz
  ǆ dz
@@ -182,6 +254,8 @@
  ǒ o
  Ǔ U
  ǔ u
+Ǥ G
+ǥ g
  Ǧ G
  ǧ g
  Ǩ K
@@ -226,6 +300,9 @@
  ț t
  Ȟ H
  ȟ h
+ȡ d
+Ȥ Z
+ȥ z
  Ȧ A
  ȧ a
  Ȩ E
@@ -234,6 +311,128 @@
  ȯ o
  Ȳ Y
  ȳ y
+ȴ l
+ȵ n
+ȶ t
+ȷ j
+ȸ db
+ȹ qp
+Ⱥ A
+Ȼ C
+ȼ c
+Ƚ L
+Ⱦ T
+ȿ s
+ɀ z
+Ƀ B
+Ʉ U
+Ɇ E
+ɇ e
+Ɉ J
+ɉ j
+Ɍ R
+ɍ r
+Ɏ Y
+ɏ y
+ɓ b
+ɕ c
+ɖ d
+ɗ d
+ɛ e
+ɟ j
+ɠ g
+ɡ g
+ɢ G
+ɦ h
+ɧ h
+ɨ i
+ɪ I
+ɫ l
+ɬ l
+ɭ l
+ɱ m
+ɲ n
+ɳ n
+ɴ N
+ɶ OE
+ɼ r
+ɽ r
+ɾ r
+ʀ R
+ʂ s
+ʈ t
+ʉ u
+ʋ v
+ʏ Y
+ʐ z
+ʑ z
+ʙ B
+ʛ G
+ʜ H
+ʝ j
+ʟ L
+ʠ q
+ʣ dz
+ʥ dz
+ʦ ts
+ʪ ls
+ʫ lz
+Ё Е
+ё е
+ᴀ    A
+ᴁ    AE
+ᴃ    B
+ᴄ    C
+ᴅ    D
+ᴆ    D
+ᴇ    E
+ᴊ    J
+ᴋ    K
+ᴌ    L
+ᴍ    M
+ᴏ    O
+ᴘ    P
+ᴛ    T
+ᴜ    U
+ᴠ    V
+ᴡ    W
+ᴢ    Z
+ᵫ    ue
+ᵬ    b
+ᵭ    d
+ᵮ    f
+ᵯ    m
+ᵰ    n
+ᵱ    p
+ᵲ    r
+ᵳ    r
+ᵴ    s
+ᵵ    t
+ᵶ    z
+ᵺ    th
+ᵻ    I
+ᵽ    p
+ᵾ    U
+ᶀ    b
+ᶁ    d
+ᶂ    f
+ᶃ    g
+ᶄ    k
+ᶅ    l
+ᶆ    m
+ᶇ    n
+ᶈ    p
+ᶉ    r
+ᶊ    s
+ᶌ    v
+ᶍ    x
+ᶎ    z
+ᶏ    a
+ᶑ    d
+ᶒ    e
+ᶓ    e
+ᶖ    i
+ᶙ    u
  Ḁ    A
  ḁ    a
  Ḃ    B
@@ -356,6 +555,10 @@
  ẗ    t
  ẘ    w
  ẙ    y
+ẚ    a
+ẜ    s
+ẝ    s
+ẞ    SS
  Ạ    A
  ạ    a
  Ả    A
@@ -386,28 +589,461 @@
  ỷ    y
  Ỹ    Y
  ỹ    y
+Ỻ    LL
+ỻ    ll
+Ỽ    V
+ỽ    v
+Ỿ    Y
+ỿ    y
+‐    -
+‑    -
+‒    -
+–    -
+—    -
+―    -
+‖    ||
+‘    '
+’    '
+‚    ,
+‛    '
+“    "
+”    "
+„    ,,
+‟    "
+․    .
+‥    ..
+…    ...
+′    '
+″    "
+‹    <
+›    >
+‼    !!
+⁄    /
+⁅    [
+⁆    ]
+⁇    ??
+⁈    ?!
+⁉    !?
+⁎    *
+₠    CE
+₢    Cr
+₣    Fr.
+₤    L.
+₧    Pts
+₹    Rs
+₺    TL
+℀    a/c
+℁    a/s
+ℂ    C
+℃    °C
+℅    c/o
+℆    c/u
+℉    °F
+ℊ    g
+ℋ    H
+ℌ    x
+ℍ    H
+ℎ    h
+ℐ    I
+ℑ    I
+ℒ    L
+ℓ    l
+ℕ    N
+№    No
+℗    (P)
+ℙ    P
+ℚ    Q
+ℛ    R
+ℜ    R
+ℝ    R
+℞    Rx
+℡    TEL
+ℤ    Z
+ℨ    Z
+ℬ    B
+ℭ    C
+ℯ    e
+ℰ    E
+ℱ    F
+ℳ    M
+ℴ    o
+ℹ    i
+℻    FAX
+ⅅ    D
+ⅆ    d
+ⅇ    e
+ⅈ    i
+ⅉ    j
+⅓     1/3
+⅔     2/3
+⅕     1/5
+⅖     2/5
+⅗     3/5
+⅘     4/5
+⅙     1/6
+⅚     5/6
+⅛     1/8
+⅜     3/8
+⅝     5/8
+⅞     7/8
+⅟     1/
+Ⅰ    I
+Ⅱ    II
+Ⅲ    III
+Ⅳ    IV
+Ⅴ    V
+Ⅵ    VI
+Ⅶ    VII
+Ⅷ    VIII
+Ⅸ    IX
+Ⅹ    X
+Ⅺ    XI
+Ⅻ    XII
+Ⅼ    L
+Ⅽ    C
+Ⅾ    D
+Ⅿ    M
+ⅰ    i
+ⅱ    ii
+ⅲ    iii
+ⅳ    iv
+ⅴ    v
+ⅵ    vi
+ⅶ    vii
+ⅷ    viii
+ⅸ    ix
+ⅹ    x
+ⅺ    xi
+ⅻ    xii
+ⅼ    l
+ⅽ    c
+ⅾ    d
+ⅿ    m
+−    -
+∕    /
+∖    \
+∣    |
+∥    ||
+≪    <<
+≫    >>
+⑴    (1)
+⑵    (2)
+⑶    (3)
+⑷    (4)
+⑸    (5)
+⑹    (6)
+⑺    (7)
+⑻    (8)
+⑼    (9)
+⑽    (10)
+⑾    (11)
+⑿    (12)
+⒀    (13)
+⒁    (14)
+⒂    (15)
+⒃    (16)
+⒄    (17)
+⒅    (18)
+⒆    (19)
+⒇    (20)
+⒈    1.
+⒉    2.
+⒊    3.
+⒋    4.
+⒌    5.
+⒍    6.
+⒎    7.
+⒏    8.
+⒐    9.
+⒑    10.
+⒒    11.
+⒓    12.
+⒔    13.
+⒕    14.
+⒖    15.
+⒗    16.
+⒘    17.
+⒙    18.
+⒚    19.
+⒛    20.
+⒜    (a)
+⒝    (b)
+⒞    (c)
+⒟    (d)
+⒠    (e)
+⒡    (f)
+⒢    (g)
+⒣    (h)
+⒤    (i)
+⒥    (j)
+⒦    (k)
+⒧    (l)
+⒨    (m)
+⒩    (n)
+⒪    (o)
+⒫    (p)
+⒬    (q)
+⒭    (r)
+⒮    (s)
+⒯    (t)
+⒰    (u)
+⒱    (v)
+⒲    (w)
+⒳    (x)
+⒴    (y)
+⒵    (z)
+⦅    ((
+⦆    ))
+⩴    ::=
+⩵    ==
+⩶    ===
+、    ,
+。    .
+〇    0
+〈    <
+〉    >
+《    <<
+》    >>
+〔    [
+〕    ]
+〘    [
+〙    ]
+〚    [
+〛    ]
+〝    "
+〞    "
+㍱    hPa
+㍲    da
+㍳    AU
+㍴    bar
+㍵    oV
+㍶    pc
+㍷    dm
+㍺    IU
+㎀    pA
+㎁    nA
+㎃    mA
+㎄    kA
+㎅    KB
+㎆    MB
+㎇    GB
+㎈    cal
+㎉    kcal
+㎊    pF
+㎋    nF
+㎎    mg
+㎏    kg
+㎐    Hz
+㎑    kHz
+㎒    MHz
+㎓    GHz
+㎔    THz
+㎙    fm
+㎚    nm
+㎜    mm
+㎝    cm
+㎞    km
+㎧    m/s
+㎩    Pa
+㎪    kPa
+㎫    MPa
+㎬    GPa
+㎭    rad
+㎮    rad/s
+㎰    ps
+㎱    ns
+㎳    ms
+㎴    pV
+㎵    nV
+㎷    mV
+㎸    kV
+㎹    MV
+㎺    pW
+㎻    nW
+㎽    mW
+㎾    kW
+㎿    MW
+㏂    a.m.
+㏃    Bq
+㏄    cc
+㏅    cd
+㏆    C/kg
+㏇    Co.
+㏈    dB
+㏉    Gy
+㏊    ha
+㏋    HP
+㏌    in
+㏍    KK
+㏎    KM
+㏏    kt
+㏐    lm
+㏑    ln
+㏒    log
+㏓    lx
+㏔    mb
+㏕    mil
+㏖    mol
+㏗    pH
+㏘    p.m.
+㏙    PPM
+㏚    PR
+㏛    sr
+㏜    Sv
+㏝    Wb
+㏞    V/m
+㏟    A/m
  ﬀ    ff
  ﬁ    fi
  ﬂ    fl
  ﬃ    ffi
  ﬄ    ffl
+ﬅ    st
  ﬆ    st
-Ø O
-ø o
-Đ D
-đ d
-ı i
-Ħ H
-ħ h
-Ł L
-ł l
-ŉ 'n
-Ŧ T
-ŧ t
-Ё Е
-ё е
-Æ AE
-ß ss
-æ ae
-Œ OE
-œ oe
+︐    ,
+︑    ,
+︒    .
+︓    :
+︔    ;
+︕    !
+︖    ?
+︙    ...
+︰    ..
+︱    -
+︲    -
+︵    (
+︶    )
+︷    {
+︸    }
+︹    [
+︺    ]
+︽    <<
+︾    >>
+︿    <
+﹀    >
+﹇    [
+﹈    ]
+﹐    ,
+﹑    ,
+﹒    .
+﹔    ;
+﹕    :
+﹖    ?
+﹗    !
+﹘    -
+﹙    (
+﹚    )
+﹛    {
+﹜    }
+﹝    [
+﹞    ]
+﹟    #
+﹠    &
+﹡    *
+﹢    +
+﹣    -
+﹤    <
+﹥    >
+﹦    =
+﹨    \
+﹩    $
+﹪    %
+﹫    @
+！    !
+＂    "
+＃    #
+＄    $
+％    %
+＆    &
+＇    '
+（    (
+）    )
+＊    *
+＋    +
+，    ,
+－    -
+．    .
+／    /
+０    0
+１    1
+２    2
+３    3
+４    4
+５    5
+６    6
+７    7
+８    8
+９    9
+：    :
+；    ;
+＜    <
+＝    =
+＞    >
+？    ?
+＠    @
+Ａ    A
+Ｂ    B
+Ｃ    C
+Ｄ    D
+Ｅ    E
+Ｆ    F
+Ｇ    G
+Ｈ    H
+Ｉ    I
+Ｊ    J
+Ｋ    K
+Ｌ    L
+Ｍ    M
+Ｎ    N
+Ｏ    O
+Ｐ    P
+Ｑ    Q
+Ｒ    R
+Ｓ    S
+Ｔ    T
+Ｕ    U
+Ｖ    V
+Ｗ    W
+Ｘ    X
+Ｙ    Y
+Ｚ    Z
+［    [
+＼    \
+］    ]
+＾    ^
+＿    _
+｀    `
+ａ    a
+ｂ    b
+ｃ    c
+ｄ    d
+ｅ    e
+ｆ    f
+ｇ    g
+ｈ    h
+ｉ    i
+ｊ    j
+ｋ    k
+ｌ    l
+ｍ    m
+ｎ    n
+ｏ    o
+ｐ    p
+ｑ    q
+ｒ    r
+ｓ    s
+ｔ    t
+ｕ    u
+ｖ    v
+ｗ    w
+ｘ    x
+ｙ    y
+ｚ    z
+｛    {
+｜    |
+｝    }
+～    ~
+｟    ((
+｠    ))
+｡    .
+､    ,
author	Teodor Sigaev
	Wed, 16 Mar 2016 13:47:03 +0000 (16:47 +0300)
committer	Teodor Sigaev
	Wed, 16 Mar 2016 13:47:03 +0000 (16:47 +0300)
contrib/unaccent/generate_unaccent_rules.py		patch \| blob \| blame \| history
contrib/unaccent/unaccent.rules		patch \| blob \| blame \| history