#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Liu Yang <mkliuyang@gmail.com>
"""
使用方法：

``` python

from dlab.utils.charset import charset_dictionary
assert '中' in charset_dictionary['CJK']


```

"""
# Names exported by ``from <this module> import *``.
__all__ = ['CharSet', 'charset_dictionary']


class AbstractCharset(object):
    """Common interface of the character sets defined in this module.

    Subclasses are expected to override :meth:`get_re_str`,
    :meth:`get_num_str`, :meth:`__contains__` and :meth:`tree`; the versions
    defined here raise ``NotImplementedError``.
    """

    # Name of the character set; shown by ``__str__``.
    name = ''
    # Free-form description; shown by ``__str__``.
    description = ''

    def number_2_str(self, number):
        """Return the one-character string for the code point *number*.

        ``chr`` is used so that every Unicode code point is supported.
        Decoding a single byte as UTF-8 (the previous approach) only works
        for values below 0x80 and fails for everything else.

        Raises:
            ValueError: if *number* is not a valid code point.
        """
        return chr(number)

    def get_re_str(self):
        """Return the set as text for a regular-expression character class."""
        raise NotImplementedError()

    def get_num_str(self):
        """Return the set as escaped code-point text; used by ``__str__``."""
        raise NotImplementedError()

    def __contains__(self, item):
        """Return whether the character *item* belongs to the set."""
        raise NotImplementedError()

    def __str__(self):
        """Return ``<CharSet:name (description) [code points]>``."""
        return '<CharSet:%s (%s) [%s]>' % (self.name, self.description, self.get_num_str())

    def tree(self, deep=0):
        """Return an indented, multi-line dump of the set at depth *deep*."""
        raise NotImplementedError()


class SimiCharSet(AbstractCharset):
    """A single contiguous range of code points, both ends inclusive."""

    def __init__(self, start_code, end_code, name, description=''):
        """Store the inclusive bounds together with the name and description."""
        self.start_code, self.end_code = start_code, end_code
        self.name = name
        self.description = description

    def __contains__(self, item):
        """Return True when the single character *item* lies in the range.

        Raises:
            ValueError: if *item* does not have length 1.
        """
        if len(item) != 1:
            raise ValueError('argument must be a character.')
        code_point = ord(item)
        return self.start_code <= code_point <= self.end_code

    def get_scope(self):
        """Return the ``(start_code, end_code)`` pair."""
        return (self.start_code, self.end_code)

    def get_re_str(self):
        """Return ``<first char>-<last char>`` built from the two bounds."""
        first_char = chr(self.start_code)
        last_char = chr(self.end_code)
        return '%s-%s' % (first_char, last_char)

    def get_num_str(self):
        """Return the bounds as ``\\uXXXX-\\uXXXX`` (upper-case hex)."""
        bounds = (self.start_code, self.end_code)
        return '\\u%04X-\\u%04X' % bounds

    def tree(self, deep=0):
        """Return this set's ``str()`` on one line, indented by *deep* tabs."""
        indent = '\t' * deep
        return '%s%s\n' % (indent, str(self))


class CharSet(AbstractCharset):
    """A named group of other character sets (ranges or nested groups)."""

    def __init__(self, charsets, name, description=''):
        """Store the member sets together with the name and description.

        Args:
            charsets: iterable of member character sets; it is iterated
                again on every query, so it should be a reusable sequence.
            name: name of the group.
            description: optional free-form description.
        """
        self.name = name
        self.charsets = charsets
        self.description = description

    def get_re_str(self):
        """Return the members' ``get_re_str()`` results concatenated."""
        return ''.join(member.get_re_str() for member in self.charsets)

    def get_num_str(self):
        """Return the members' ``get_num_str()`` results concatenated."""
        return ''.join(member.get_num_str() for member in self.charsets)

    def __contains__(self, item):
        """Return True when the single character *item* is in any member.

        Raises:
            ValueError: if *item* does not have length 1.
        """
        if len(item) != 1:
            raise ValueError('argument must be a character.')
        # Use the ``in`` operator rather than calling the dunder directly.
        return any(item in member for member in self.charsets)

    def tree(self, deep=0):
        """Return this group's line followed by each member's subtree.

        The group's own line is indented by *deep* tabs and the members are
        rendered one level deeper.
        """
        # The loop variable is not called ``set`` so the builtin stays usable.
        subtrees = ''.join(member.tree(deep + 1) for member in self.charsets)
        return '%s%s\n%s' % ('\t' * deep, str(self), subtrees)


# Registry of named character sets.  This first pass registers plain
# contiguous ranges; each one is stored under a key equal to its own name.
# Table columns: first code point, last code point (inclusive), name,
# description.
charset_dictionary = {}
charset_dictionary.update({
    block_name: SimiCharSet(first_code, last_code, block_name, block_description)
    for first_code, last_code, block_name, block_description in (
        (0x0020, 0x007F, "Basic Latin", "基础拉丁语"),
        (0x00A0, 0x00FF, "Latin-1 Supplement", "Latin-1补充"),
        (0x0100, 0x017F, "Latin Extended-A", "拉丁语扩展-A"),
        (0x0180, 0x024F, "Latin Extended-B", "拉丁语扩展-B"),
        (0x0250, 0x02AF, "IPA Extensions", "IPA扩展"),
        (0x02B0, 0x02FF, "Spacing Modifier Letters", "间距修改字母"),
        (0x0300, 0x036F, "Combining Diacritical Marks", "结合变音符号"),
        (0x0370, 0x03FF, "Greek and Coptic", "希腊语和科普特语"),
        (0x0400, 0x04FF, "Cyrillic", "西里尔"),
        (0x0500, 0x052F, "Cyrillic Supplementary", "西里尔文补充"),
        (0x0530, 0x058F, "Armenian", "亚美尼亚"),
        (0x0590, 0x05FF, "Hebrew", "希伯来语"),
        (0x0600, 0x06FF, "Arabic", "阿拉伯"),
        (0x0700, 0x074F, "Syriac", "叙利亚"),
        (0x0780, 0x07BF, "Thaana", "塔纳文"),
        (0x0900, 0x097F, "Devanagari", "梵文"),
        (0x0980, 0x09FF, "Bengali", "孟加拉"),
        (0x0A00, 0x0A7F, "Gurmukhi", "古尔穆基"),
        (0x0A80, 0x0AFF, "Gujarati", "古吉拉特语"),
        (0x0B00, 0x0B7F, "Oriya", "奥里亚语"),
        (0x0B80, 0x0BFF, "Tamil", "泰米尔人"),
        (0x0C00, 0x0C7F, "Telugu", "泰卢固语"),
        (0x0C80, 0x0CFF, "Kannada", "卡纳达语"),
        (0x0D00, 0x0D7F, "Malayalam", "马拉雅拉姆语"),
        (0x0D80, 0x0DFF, "Sinhala", "僧伽罗语"),
        (0x0E00, 0x0E7F, "Thai", "泰国"),
        (0x0E80, 0x0EFF, "Lao", "老挝"),
        (0x0F00, 0x0FFF, "Tibetan", "藏"),
        (0x1000, 0x109F, "Myanmar", "缅甸"),
        (0x10A0, 0x10FF, "Georgian", "格鲁吉亚"),
        (0x1100, 0x11FF, "Hangul Jamo", "Hangul Jamo"),
        (0x1200, 0x137F, "Ethiopic", "衣索比亚"),
        (0x13A0, 0x13FF, "Cherokee", "切诺基"),
        (0x1400, 0x167F, "Unified Canadian Aboriginal Syllabics", "统一的加拿大原住民音节"),
        (0x1680, 0x169F, "Ogham", "欧甘文"),
        (0x16A0, 0x16FF, "Runic", "符文"),
        (0x1700, 0x171F, "Tagalog", "他加禄语"),
        (0x1720, 0x173F, "Hanunoo", "哈努诺文"),
        (0x1740, 0x175F, "Buhid", "布迪文"),
        (0x1760, 0x177F, "Tagbanwa", "塔格巴努亚文"),
        (0x1780, 0x17FF, "Khmer", "高棉"),
        (0x1800, 0x18AF, "Mongolian", "蒙"),
        (0x1900, 0x194F, "Limbu", "林布"),
        (0x1950, 0x197F, "Tai Le", "泰乐"),
        (0x19E0, 0x19FF, "Khmer Symbols", "高棉符号"),
        (0x1D00, 0x1D7F, "Phonetic Extensions", "语音扩展"),
        (0x1E00, 0x1EFF, "Latin Extended Additional", "拉丁语扩展附加"),
        (0x1F00, 0x1FFF, "Greek Extended", "希腊语扩展"),
        (0x2000, 0x206F, "General Punctuation", "一般标点符号"),
        (0x2070, 0x209F, "Superscripts and Subscripts", "上标和下标"),
        (0x20A0, 0x20CF, "Currency Symbols", "货币符号"),
        (0x20D0, 0x20FF, "Combining Diacritical Marks for Symbols", "结合符号的变音符号"),
        (0x2100, 0x214F, "Letterlike Symbols", "字母符号"),
        (0x2150, 0x218F, "Number Forms", "数字表格"),
        (0x2190, 0x21FF, "Arrows", "箭头"),
        (0x2200, 0x22FF, "Mathematical Operators", "数学运算符"),
        (0x2300, 0x23FF, "Miscellaneous Technical", "其他技术"),
        (0x2400, 0x243F, "Control Pictures", "控制图片"),
        (0x2440, 0x245F, "Optical Character Recognition", "光学字符识别"),
        (0x2460, 0x24FF, "Enclosed Alphanumerics", "封闭的字母数字"),
        (0x2500, 0x257F, "Box Drawing", "框图"),
        (0x2580, 0x259F, "Block Elements", "块元素"),
        (0x25A0, 0x25FF, "Geometric Shapes", "几何形状"),
        (0x2600, 0x26FF, "Miscellaneous Symbols", "杂项符号"),
        (0x2700, 0x27BF, "Dingbats", "装饰符号"),
        (0x27C0, 0x27EF, "Miscellaneous Mathematical Symbols-A", "杂项数学符号-A"),
        (0x27F0, 0x27FF, "Supplemental Arrows-A", "补充箭头-A"),
        (0x2800, 0x28FF, "Braille Patterns", "盲文模式"),
        (0x2900, 0x297F, "Supplemental Arrows-B", "补充箭头-B"),
        (0x2980, 0x29FF, "Miscellaneous Mathematical Symbols-B", "杂项数学符号-B"),
        (0x2A00, 0x2AFF, "Supplemental Mathematical Operators", "补充数学运算符"),
        (0x2B00, 0x2BFF, "Miscellaneous Symbols and Arrows", "杂项符号和箭头"),
        (0x2E80, 0x2EFF, "CJK Radicals Supplement", "CJK Radicals Supplement"),
        (0x2F00, 0x2FDF, "Kangxi Radicals", "康熙激进派"),
        (0x2FF0, 0x2FFF, "Ideographic Description Characters", "表意文字描述字符"),
        (0x3000, 0x303F, "CJK Symbols and Punctuation", "CJK符号和标点符号"),
        (0x3040, 0x309F, "Hiragana", "平假名"),
        (0x30A0, 0x30FF, "Katakana", "片假名"),
        (0x3100, 0x312F, "Bopomofo", "汉语拼音"),
        (0x3130, 0x318F, "Hangul Compatibility Jamo", "韩文兼容性Jamo"),
        (0x3190, 0x319F, "Kanbun", "汉文"),
        (0x31A0, 0x31BF, "Bopomofo Extended", "汉语拼音扩展"),
        (0x31F0, 0x31FF, "Katakana Phonetic Extensions", "片假名语音扩展"),
        (0x3200, 0x32FF, "Enclosed CJK Letters and Months", "CJK信件和月份扩展"),
        (0x3300, 0x33FF, "CJK Compatibility", "CJK兼容性"),
        (0x3400, 0x4DBF, "CJK Unified Ideographs Extension A", "中日韩统一表意文字扩展A"),
        (0x4DC0, 0x4DFF, "Yijing Hexagram Symbols", "Yijing Hexagram符号"),
        (0x4E00, 0x9FFF, "CJK Unified Ideographs", "CJK统一表意文字"),
        (0xA000, 0xA48F, "Yi Syllables", "彝族音节"),
        (0xA490, 0xA4CF, "Yi Radicals", "Yi Radicals"),
        (0xAC00, 0xD7AF, "Hangul Syllables", "韩语音节"),
        (0xD800, 0xDB7F, "High Surrogates", "高代理人"),
        (0xDB80, 0xDBFF, "High Private Use Surrogates", "高级私人使用代理人"),
        (0xDC00, 0xDFFF, "Low Surrogates", "低代理人"),
        (0xE000, 0xF8FF, "Private Use Area", "私人使用区"),
        (0xF900, 0xFAFF, "CJK Compatibility Ideographs", "CJK兼容性表意文字"),
        (0xFB00, 0xFB4F, "Alphabetic Presentation Forms", "字母表示形式"),
        (0xFB50, 0xFDFF, "Arabic Presentation Forms-A", "阿拉伯语演讲表格"),
        (0xFE00, 0xFE0F, "Variation Selectors", "变异选择器"),
        (0xFE20, 0xFE2F, "Combining Half Marks", "结合半标记"),
        (0xFE30, 0xFE4F, "CJK Compatibility Forms", "CJK兼容性形式"),
        (0xFE50, 0xFE6F, "Small Form Variants", "小型变体"),
        (0xFE70, 0xFEFF, "Arabic Presentation Forms-B", "阿拉伯语演讲形式"),
        (0xFF00, 0xFFEF, "Halfwidth and Fullwidth Forms", "半宽和全宽形式"),
        (0xFFF0, 0xFFFF, "Specials", "特殊符号"),
        # The ranges above U+FFFF below are disabled and are not registered.
        # (0x10000, 0x1007F, "Linear B Syllabary", "线性B音节"),
        # (0x10080, 0x100FF, "Linear B Ideograms", "线性B表意文字"),
        # (0x10100, 0x1013F, "Aegean Numbers", "爱琴海数字"),
        # (0x10300, 0x1032F, "Old Italic", "旧意大利语"),
        # (0x10330, 0x1034F, "Gothic", "哥特"),
        # (0x10380, 0x1039F, "Ugaritic", "乌加里特"),
        # (0x10400, 0x1044F, "Deseret", "犹他州"),
        # (0x10450, 0x1047F, "Shavian", "萧伯纳"),
        # (0x10480, 0x104AF, "Osmanya", "奥斯曼亚语"),
        # (0x10800, 0x1083F, "Cypriot Syllabary", "塞浦路斯音节"),
        # (0x1D000, 0x1D0FF, "Byzantine Musical Symbols", "拜占庭音乐符号"),
        # (0x1D100, 0x1D1FF, "Musical Symbols", "音乐符号"),
        # (0x1D300, 0x1D35F, "Tai Xuan Jing Symbols", "太玄景符号"),
        # (0x1D400, 0x1D7FF, "Mathematical Alphanumeric Symbols", "数学字母数字符号"),
        # (0x20000, 0x2A6DF, "CJK Unified Ideographs Extension B", "CJK统一表意文字扩展B"),
        # (0x2F800, 0x2FA1F, "CJK Compatibility Ideographs Supplement", "CJK兼容性表意文字补充"),
        # (0xE0000, 0xE007F, "Tags", "标签"),
    )
})

# First level of groups: each entry bundles several of the plain ranges that
# are already registered in ``charset_dictionary``.
charset_dictionary.update({
    "Latin": CharSet([
        charset_dictionary["Basic Latin"],
        charset_dictionary["Latin-1 Supplement"],
        charset_dictionary["Latin Extended-A"],
        charset_dictionary["Latin Extended-B"],
        charset_dictionary["Latin Extended Additional"],
    ], "Latin", "拉丁语"),
    "Functional Character": CharSet([
        charset_dictionary["Spacing Modifier Letters"],
        charset_dictionary["Combining Diacritical Marks"],
    ], "Functional Character", "功能性字母"),
    # NOTE(review): "CJK Unified Ideographs" is not part of this group, so
    # common kanji are not reported as Japanese -- confirm this is intended.
    "Japanese": CharSet([
        charset_dictionary["CJK Symbols and Punctuation"],
        charset_dictionary["Hiragana"],
        charset_dictionary["Katakana"],
        charset_dictionary["Katakana Phonetic Extensions"],
        charset_dictionary["Enclosed CJK Letters and Months"],
        charset_dictionary["CJK Compatibility"],
        charset_dictionary["CJK Unified Ideographs Extension A"],
        charset_dictionary["CJK Compatibility Ideographs"],
    ], "Japanese", "日语"),
    "Chinese": CharSet([
        charset_dictionary["CJK Radicals Supplement"],
        charset_dictionary["Kangxi Radicals"],
        charset_dictionary["CJK Symbols and Punctuation"],
        charset_dictionary["Bopomofo"],
        charset_dictionary["Kanbun"],
        charset_dictionary["Bopomofo Extended"],
        charset_dictionary["CJK Unified Ideographs Extension A"],
        charset_dictionary["Yijing Hexagram Symbols"],
        charset_dictionary["CJK Unified Ideographs"],
        charset_dictionary["CJK Compatibility Ideographs"],
    ], "Chinese", "中文"),
    "Yi": CharSet([
        charset_dictionary["Yi Syllables"],
        charset_dictionary["Yi Radicals"],
    ], "Yi", "彝族语言"),
    "Hangul": CharSet([
        charset_dictionary["Hangul Syllables"],
        charset_dictionary["Hangul Compatibility Jamo"],
    ], "Hangul", "韩语"),
    # This group reuses the key "Arabic" and therefore replaces the plain
    # range registered under that key.  The range is looked up while this
    # literal is being built (before ``update`` overwrites the key) and kept
    # as a member, so it is not dropped from the group or from any group
    # built on top of it.
    "Arabic": CharSet([
        charset_dictionary["Arabic"],
        charset_dictionary["Arabic Presentation Forms-A"],
        charset_dictionary["Arabic Presentation Forms-B"],
    ], "Arabic", "阿拉伯语"),
})

# Second level of groups, built from entries that are already registered.
# Every member is listed exactly once: "Arabic" used to appear twice, and
# "Latin Extended Additional" was listed directly although it is already a
# member of "Latin".  Membership is unchanged; the repeated ranges and
# subtrees disappear from get_re_str(), get_num_str() and tree().
charset_dictionary.update({
    "Alphabet Language": CharSet([
        charset_dictionary[member_name] for member_name in (
            "Latin",
            "Functional Character",
            "Greek and Coptic",
            "Cyrillic",
            "Cyrillic Supplementary",
            "Armenian",
            "Hebrew",
            "Arabic",
            "Syriac",
            "Thaana",
            "Devanagari",
            "Bengali",
            "Gurmukhi",
            "Gujarati",
            "Oriya",
            "Tamil",
            "Telugu",
            "Kannada",
            "Malayalam",
            "Sinhala",
            "Thai",
            "Lao",
            "Tibetan",
            "Myanmar",
            "Georgian",
            "Hangul Jamo",
            "Ethiopic",
            "Cherokee",
            "Unified Canadian Aboriginal Syllabics",
            "Ogham",
            "Runic",
            "Tagalog",
            "Hanunoo",
            "Buhid",
            "Tagbanwa",
            "Khmer",
            "Mongolian",
            "Limbu",
            "Tai Le",
            "Khmer Symbols",
            "Phonetic Extensions",
            "Greek Extended",
            "General Punctuation",
        )
    ], "Alphabet Language", "字母语言"),
    "CJK": CharSet([
        charset_dictionary[member_name] for member_name in (
            "Chinese",
            "Japanese",
            "Yi",
            "Hangul",
        )
    ], "CJK", "中日韩"),
})

# Third level of groups: languages, mathematical blocks and symbol blocks.
# Members are looked up by name in the registry, in the order listed.
charset_dictionary.update({
    "Languages": CharSet(
        [charset_dictionary[member_name] for member_name in (
            "Alphabet Language",
            "CJK",
            "Braille Patterns",
        )],
        "Languages", "语言"),
    "Math": CharSet(
        [charset_dictionary[member_name] for member_name in (
            "Superscripts and Subscripts",
            "Mathematical Operators",
            "Miscellaneous Technical",
            "Enclosed Alphanumerics",
            "Miscellaneous Mathematical Symbols-A",
            "Miscellaneous Mathematical Symbols-B",
            "Supplemental Mathematical Operators",
        )],
        "Math", "数学"),
    "Symbols": CharSet(
        [charset_dictionary[member_name] for member_name in (
            "Currency Symbols",
            "Combining Diacritical Marks for Symbols",
            "Letterlike Symbols",
            "Number Forms",
            "Arrows",
            "Control Pictures",
            "Optical Character Recognition",
            "Box Drawing",
            "Block Elements",
            "Geometric Shapes",
            "Miscellaneous Symbols",
            "Dingbats",
            "Supplemental Arrows-A",
            "Supplemental Arrows-B",
            "Miscellaneous Symbols and Arrows",
        )],
        "Symbols", "符号"),
})

# Top-level group combining the "Languages", "Math" and "Symbols" groups.
charset_dictionary["All"] = CharSet(
    [
        charset_dictionary["Languages"],
        charset_dictionary["Math"],
        charset_dictionary["Symbols"],
    ],
    "All",
    "全部",
)

if __name__ == '__main__':
    # When run as a script, print the whole hierarchy under the "All" group.
    full_hierarchy = charset_dictionary['All'].tree()
    print(full_hierarchy)
