#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Liu Yang <mkliuyang@gmail.com>
import re
from collections import Counter
from typing import List, Any, Dict

# Screen-column width lookup table used by get_width().
# Each entry is (upper_bound_code_point, width) and the entries are sorted by
# ascending upper bound. A code point takes the width of the first entry
# whose bound is greater than or equal to it, so every entry effectively
# covers the range from the previous bound + 1 up to its own bound.
# Widths are 0, 1 or 2 columns.
widths = [
    (126, 1), (159, 0), (687, 1), (710, 0), (711, 1),
    (727, 0), (733, 1), (879, 0), (1154, 1), (1161, 0),
    (4347, 1), (4447, 2), (7467, 1), (7521, 0), (8369, 1),
    (8426, 0), (9000, 1), (9002, 2), (11021, 1), (12350, 2),
    (12351, 1), (12438, 2), (12442, 0), (19893, 2), (19967, 1),
    (55203, 2), (63743, 1), (64106, 2), (65039, 1), (65059, 0),
    (65131, 2), (65279, 1), (65376, 2), (65500, 1), (65510, 2),
    (120831, 1), (262141, 2), (1114109, 1),
]


def get_width(o):
    """Look up how many screen columns the unicode ordinal ``o`` occupies.

    Ordinals 0x0E and 0x0F are always reported as width 0. Any other
    ordinal takes the width of the first ``widths`` entry whose upper bound
    is not below it; an ordinal above every bound falls back to width 1.
    """
    if o in (0xe, 0xf):
        return 0
    # ``widths`` is ordered by ascending bound, so the first match wins.
    return next((width for bound, width in widths if o <= bound), 1)


def get_string_width(s):
    """Return the total screen column width of the string ``s``.

    The result is the sum of ``get_width`` applied to the ordinal of every
    character in ``s``; an empty string yields 0.
    """
    total = 0
    for ch in s:
        total += get_width(ord(ch))
    return total


class SentenceSpliter(object):
    """Split raw (Chinese) text into sentences.

    The input is typically text taken directly from Word or PDF documents.
    At present :meth:`split` only breaks the text into lines and merges
    them back into paragraphs; it performs no finer sentence segmentation.
    """

    def _merge_paragraph(self, lines: List[str]) -> List[str]:
        """Join runs of consecutive non-empty lines into paragraphs.

        An empty line terminates the current paragraph. Lines belonging to
        the same paragraph are concatenated without any separator. Empty
        lines themselves never appear in the result, so leading, trailing
        or repeated blank lines produce no empty entries.

        Args:
            lines: The text already split into individual lines.

        Returns:
            One string per paragraph, in document order. The list is empty
            when ``lines`` contains no non-empty line.
        """
        paragraphs = []
        current = []
        for line in lines:
            if line:
                current.append(line)
            elif current:
                # Blank line: close the paragraph collected so far.
                paragraphs.append(''.join(current))
                current = []
        if current:
            # Flush the final paragraph when the text does not end on a
            # blank line.
            paragraphs.append(''.join(current))
        return paragraphs

    def split(self, file_data: str) -> List[str]:
        """Split ``file_data`` into paragraphs.

        The text is cut at line breaks and the resulting lines are merged
        with :meth:`_merge_paragraph`, treating blank lines as paragraph
        separators.

        Args:
            file_data: The full raw text.

        Returns:
            The list of non-empty paragraphs found in ``file_data``.
        """
        lines = self._split_by_new_line(file_data)
        return self._merge_paragraph(lines)

    @staticmethod
    def _split_by_new_line(file_data: str) -> List[str]:
        """Split ``file_data`` at every ``\\n`` or ``\\r\\n`` line break.

        Returns:
            The lines without their line terminators; empty lines are kept
            as empty strings.
        """
        return re.split(r'\r?\n', file_data)
