Unicode range of Chinese characters
Unicode code point range:
U+4E00 ~ U+9FFF
U+4E00 ~ U+9FA5 is the most commonly used range; it lies in the block named "CJK Unified Ideographs". The code points U+9FA6 ~ U+9FFF were initially left unassigned, but there is no guarantee that they will remain unassigned in the future, so the check below covers the whole block up to U+9FFF.
def is_zh(char):
    """Tell whether *char* falls in the range U+4E00 through U+9FFF.

    The comparison is an ordinary string comparison, so a longer string is
    compared lexicographically; pass exactly one character for a meaningful
    answer.

    :param char: Single character
    :return: True if *char* is within U+4E00..U+9FFF (inclusive), else False
    """
    return u'\u4e00' <= char <= u'\u9fff'
Counting Chinese and English words
In a Word document, Review -> Word Count
reports the number of words, Chinese characters, non-Chinese words, and so on. The code below implements a similar count in Python.
Example: Hello, world -> 4
# -*- coding: utf-8 -*-
import re
def strQ2B(ustring):
    """Convert full-width characters in *ustring* to their half-width forms.

    Code point 12288 (the full-width space) is mapped to 32 (ASCII space).
    Code points 65281..65374 are shifted down by 65248, which lands them on
    the ASCII range 33..126. Every other character is passed through as is.

    :param ustring: string (or iterable of single characters) to convert
    :return: the converted string
    """
    converted = []
    for uchar in ustring:
        code = ord(uchar)
        if code == 12288:
            # The full-width space does not follow the fixed offset rule.
            code = 32
        elif 65281 <= code <= 65374:
            # Remaining full-width forms sit at a constant offset from ASCII.
            code -= 65248
        converted.append(chr(code))
    # Join once at the end instead of growing a string character by character.
    return "".join(converted)
def querySimpleProcess(ss):
    """Normalize a query string before counting.

    Steps, in order:
      1. convert full-width characters to half-width via ``strQ2B``;
      2. replace every character that is neither in U+4E00..U+9FFF nor an
         ASCII letter or digit with a space;
      3. collapse each run of whitespace into a single space;
      4. strip leading/trailing whitespace and lowercase the result.

    :param ss: raw input string
    :return: the normalized, lowercased string
    """
    half_width = strQ2B(ss)
    # The negative lookahead keeps Chinese characters and ASCII alphanumerics;
    # "." then consumes (and blanks out) anything else except newlines, which
    # the whitespace collapse below takes care of.
    kept = re.sub(r"(?![\u4e00-\u9fff]|[0-9a-zA-Z]).", " ", half_width)
    collapsed = re.sub(r"\s+", " ", kept)
    return collapsed.strip().lower()
# Determine if it contains Chinese
def check_contain_chinese(check_str):
    """Tell whether *check_str* has at least one character in U+4E00..U+9FFF.

    :param check_str: string to inspect
    :return: True if any character is in the range, False otherwise
        (including for an empty string)
    """
    return any(u'\u4e00' <= ch <= u'\u9fff' for ch in check_str)
# Determine if it contains English
def check_contain_english(check_str):
    """Tell whether *check_str* has at least one ASCII letter (a-z or A-Z).

    :param check_str: string to inspect
    :return: True if any character is an ASCII letter, False otherwise
        (including for an empty string)
    """
    return any(
        u'a' <= ch <= u'z' or u'A' <= ch <= u'Z'
        for ch in check_str
    )
# Delete letters from a string for character counting purposes
def delete_letters(ss):
    """Remove every ASCII letter (a-z, A-Z) from *ss*.

    :param ss: input string
    :return: *ss* with all ASCII letters removed; other characters are kept
    """
    return re.sub(r"[a-zA-Z]+", "", ss)
# Split the normalized string on spaces, then score each element:
#   - element without ASCII letters: its length (Chinese characters + digits)
#   - element with ASCII letters:    1 + the number of non-letter characters
# Known limitation: all letters inside one element count as a single word,
# even when they form several runs separated by Chinese characters or digits
# (the original notes give "C Mile C Mile=3" as a case that is not handled).
def countCharacters(inputStr):
    """Count the "words" in *inputStr*, Word-style.

    The input is first normalized with ``querySimpleProcess``, which leaves
    only Chinese characters, ASCII letters and digits, separated by single
    spaces. Each space-separated element then contributes:

    * ``len(elem)`` when it contains no ASCII letter;
    * ``1 + len(elem without letters)`` when it contains at least one.

    :param inputStr: raw text to count
    :return: the total count as an int (0 when nothing countable remains)
    """
    normalized = querySimpleProcess(inputStr)
    total = 0
    # split(" ") on an empty string yields [""], which contributes 0 below,
    # so no separate empty-input guard is needed.
    for elem in normalized.split(" "):
        if check_contain_english(elem):
            # All letters in the element count as one word; everything that
            # is left after removing them is counted per character.
            total += 1 + len(delete_letters(elem))
        else:
            total += len(elem)
    return total