Files
opencafe/cafe/common/unicode.py
Jose Idar ecae985fa4 Adds methods for working with the unicode codepoint space
Change-Id: If2e5c3232094116b8a9b71b1981fe3418aaceebc
2013-07-08 17:55:42 -05:00

565 lines
23 KiB
Python

"""
@see: http://en.wikipedia.org/wiki/Unicode#Architecture_and_terminology
Imports:
unicodedata
Classes:
UnicodeRange
A UnicodeRange object contains a start, end, and name attribute
which normally corresponds to the start and end integer for a
range of Unicode codepoints.
Each UnicodeRange object includes generators for performing common
functions on the codepoints in that integer range, such as:
codepoints(): yields every integer in that Block's range.
codepoint_names(): yields the Unicode name of every codepoint
integer in the Block's range.
encoded_codepoints(): yields an encoded (UTF-8 by default)
string representation of the character
the codepoint represents.
UnicodeRangeList
A list-like object normally made up of UnicodeRange objects.
Meant as a container for containing large, and/or disjointed ranges
Includes definitions for the UnicodeRange object methods
codepoints(), codepoint_names(), and encoded_codepoints() so that
a user can still iterate through the codepoints in the entire list,
even if the ranges are disjointed. This allows for creating
custom ranges for specific testing.
Constants:
UNICODE_BLOCKS
A list-like object (UnicodeRangeList) made up of
UnicodeRange objects. Each UnicodeRange object in the list
corresponds to a named Unicode Block, and contains the start
and end integer for that Block.
UNICODE_PLANES
A list-like object (UnicodeRangeList) made up of UnicodeRange
objects. It covers the same total range as UNICODE_BLOCKS, but is
instead organized by plane names instead of block names, which
results in fewer but larger ranges.
UNICODE_STARTING_CODEPOINT
Integer denoting the first unicode codepoint
UNICODE_ENDING_CODEPOINT
Integer denoting the last unicode codepoint
PLANE_NAMES
Namespace (class) containing enums of all Unicode Plane names as
strings.
BLOCK_NAMES
Namespace (class) containing enums of all Unicode Block names as
strings.
Usage Examples:
Print all the characters in the "Thai" unicode block:
for c in UNICODE_BLOCKS.get_range(BLOCK_NAMES.thai).encoded_codepoints():
print c
Iterate through all the integer codepoints in the "Thai" unicode block:
for i in UNICODE_BLOCKS.get_range(BLOCK_NAMES.thai).codepoints():
do_something(i)
Get a list of the names of all the characters in the "Thai" unicode block:
[n for n in UNICODE_BLOCKS.get_range(BLOCK_NAMES.thai).codepoint_names()]
"""
import unicodedata
# Placeholders for the two list-like range tables.  Both are rebound to
# real objects at import time, near the bottom of this file.

# Named ranges of unicode codepoints, one entry per Unicode Block
UNICODE_BLOCKS = None
# Named ranges of unicode codepoints, one entry per Unicode Plane
UNICODE_PLANES = None

# First unicode codepoint, as an integer
UNICODE_STARTING_CODEPOINT = 0
# Last unicode codepoint, as an integer
UNICODE_ENDING_CODEPOINT = 0x10FFFD
class PLANE_NAMES(object):
    """
    Enum-style namespace holding the Unicode Plane names as strings.

    The attributes are plain class-level strings; the class is not meant to
    be instantiated.
    """

    basic_multilingual_plane = 'Basic Multilingual Plane'

    supplementary_multilingual_plane = 'Supplementary Multilingual Plane'

    supplementary_ideographic_plane = 'Supplementary Ideographic Plane'

    # Label used for the span of planes that carry no name of their own
    unassigned = 'Unassigned'

    supplementary_special_purpose_plane = 'Supplementary Special-purpose Plane'

    supplementary_private_use_area = 'Supplementary Private Use Area'
class BLOCK_NAMES(object):
    """
    Enum-style namespace holding the Unicode Block names as strings.

    The attributes are plain class-level strings; the class is not meant to
    be instantiated.  Attribute names are the lower-cased, underscore
    separated form of the block name they hold.
    """

    basic_latin = 'Basic Latin'
    c1_controls_and_latin_1_supplement = 'C1 Controls and Latin-1 Supplement'
    latin_extended_a = 'Latin Extended-A'
    latin_extended_b = 'Latin Extended-B'
    ipa_extensions = 'IPA Extensions'
    spacing_modifier_letters = 'Spacing Modifier Letters'
    combining_diacritical_marks = 'Combining Diacritical Marks'
    # NOTE: this value is spelled with an underscore rather than a space
    greek_coptic = 'Greek_Coptic'
    cyrillic = 'Cyrillic'
    cyrillic_supplement = 'Cyrillic Supplement'
    armenian = 'Armenian'
    hebrew = 'Hebrew'
    arabic = 'Arabic'
    syriac = 'Syriac'
    # Shared label for ranges in the block table that have no block name
    undefined = 'Undefined'
    thaana = 'Thaana'
    devanagari = 'Devanagari'
    # NOTE: this value is spelled with an underscore rather than a space
    bengali_assamese = 'Bengali_Assamese'
    gurmukhi = 'Gurmukhi'
    gujarati = 'Gujarati'
    oriya = 'Oriya'
    tamil = 'Tamil'
    telugu = 'Telugu'
    kannada = 'Kannada'
    malayalam = 'Malayalam'
    sinhala = 'Sinhala'
    thai = 'Thai'
    lao = 'Lao'
    tibetan = 'Tibetan'
    myanmar = 'Myanmar'
    georgian = 'Georgian'
    hangul_jamo = 'Hangul Jamo'
    ethiopic = 'Ethiopic'
    cherokee = 'Cherokee'
    unified_canadian_aboriginal_syllabics = (
        'Unified Canadian Aboriginal Syllabics')
    ogham = 'Ogham'
    runic = 'Runic'
    tagalog = 'Tagalog'
    hanunoo = 'Hanunoo'
    buhid = 'Buhid'
    tagbanwa = 'Tagbanwa'
    khmer = 'Khmer'
    mongolian = 'Mongolian'
    limbu = 'Limbu'
    tai_le = 'Tai Le'
    khmer_symbols = 'Khmer Symbols'
    phonetic_extensions = 'Phonetic Extensions'
    latin_extended_additional = 'Latin Extended Additional'
    greek_extended = 'Greek Extended'
    general_punctuation = 'General Punctuation'
    superscripts_and_subscripts = 'Superscripts and Subscripts'
    currency_symbols = 'Currency Symbols'
    combining_diacritical_marks_for_symbols = (
        'Combining Diacritical Marks for Symbols')
    letterlike_symbols = 'Letterlike Symbols'
    number_forms = 'Number Forms'
    arrows = 'Arrows'
    mathematical_operators = 'Mathematical Operators'
    miscellaneous_technical = 'Miscellaneous Technical'
    control_pictures = 'Control Pictures'
    optical_character_recognition = 'Optical Character Recognition'
    enclosed_alphanumerics = 'Enclosed Alphanumerics'
    box_drawing = 'Box Drawing'
    block_elements = 'Block Elements'
    geometric_shapes = 'Geometric Shapes'
    miscellaneous_symbols = 'Miscellaneous Symbols'
    dingbats = 'Dingbats'
    miscellaneous_mathematical_symbols_a = (
        'Miscellaneous Mathematical Symbols-A')
    supplemental_arrows_a = 'Supplemental Arrows-A'
    braille_patterns = 'Braille Patterns'
    supplemental_arrows_b = 'Supplemental Arrows-B'
    miscellaneous_mathematical_symbols_b = (
        'Miscellaneous Mathematical Symbols-B')
    supplemental_mathematical_operators = 'Supplemental Mathematical Operators'
    miscellaneous_symbols_and_arrows = 'Miscellaneous Symbols and Arrows'
    cjk_radicals_supplement = 'CJK Radicals Supplement'
    kangxi_radicals = 'Kangxi Radicals'
    ideographic_description_characters = 'Ideographic Description Characters'
    cjk_symbols_and_punctuation = 'CJK Symbols and Punctuation'
    hiragana = 'Hiragana'
    katakana = 'Katakana'
    bopomofo = 'Bopomofo'
    hangul_compatibility_jamo = 'Hangul Compatibility Jamo'
    kanbun_kunten = 'Kanbun (Kunten)'
    bopomofo_extended = 'Bopomofo Extended'
    katakana_phonetic_extensions = 'Katakana Phonetic Extensions'
    enclosed_cjk_letters_and_months = 'Enclosed CJK Letters and Months'
    cjk_compatibility = 'CJK Compatibility'
    cjk_unified_ideographs_extension_a = 'CJK Unified Ideographs Extension A'
    yijing_hexagram_symbols = 'Yijing Hexagram Symbols'
    cjk_unified_ideographs = 'CJK Unified Ideographs'
    yi_syllables = 'Yi Syllables'
    yi_radicals = 'Yi Radicals'
    hangul_syllables = 'Hangul Syllables'
    high_surrogate_area = 'High Surrogate Area'
    low_surrogate_area = 'Low Surrogate Area'
    private_use_area = 'Private Use Area'
    cjk_compatibility_ideographs = 'CJK Compatibility Ideographs'
    alphabetic_presentation_forms = 'Alphabetic Presentation Forms'
    arabic_presentation_forms_a = 'Arabic Presentation Forms-A'
    variation_selectors = 'Variation Selectors'
    combining_half_marks = 'Combining Half Marks'
    cjk_compatibility_forms = 'CJK Compatibility Forms'
    small_form_variants = 'Small Form Variants'
    arabic_presentation_forms_b = 'Arabic Presentation Forms-B'
    halfwidth_and_fullwidth_forms = 'Halfwidth and Fullwidth Forms'
    specials = 'Specials'
    linear_b_syllabary = 'Linear B Syllabary'
    linear_b_ideograms = 'Linear B Ideograms'
    aegean_numbers = 'Aegean Numbers'
    old_italic = 'Old Italic'
    gothic = 'Gothic'
    ugaritic = 'Ugaritic'
    deseret = 'Deseret'
    shavian = 'Shavian'
    osmanya = 'Osmanya'
    cypriot_syllabary = 'Cypriot Syllabary'
    byzantine_musical_symbols = 'Byzantine Musical Symbols'
    musical_symbols = 'Musical Symbols'
    tai_xuan_jing_symbols = 'Tai Xuan Jing Symbols'
    mathematical_alphanumeric_symbols = 'Mathematical Alphanumeric Symbols'
    cjk_unified_ideographs_extension_b = 'CJK Unified Ideographs Extension B'
    cjk_compatibility_ideographs_supplement = (
        'CJK Compatibility Ideographs Supplement')
    # Shared label for the unnamed ranges in the upper part of the table
    unused = 'Unused'
    tags = 'Tags'
    variation_selectors_supplement = 'Variation Selectors Supplement'
    supplementary_private_use_area_a = 'Supplementary Private Use Area-A'
    supplementary_private_use_area_b = 'Supplementary Private Use Area-B'
# Raw plane table: one (first codepoint, last codepoint, plane name) row per
# entry.  Converted into UnicodeRange objects near the bottom of this file.
_unicode_planes = (
    (0x000000, 0x00FFFF, PLANE_NAMES.basic_multilingual_plane),
    (0x010000, 0x01FFFF, PLANE_NAMES.supplementary_multilingual_plane),
    (0x020000, 0x02FFFF, PLANE_NAMES.supplementary_ideographic_plane),
    (0x030000, 0x0DFFFF, PLANE_NAMES.unassigned),
    (0x0E0000, 0x0EFFFF, PLANE_NAMES.supplementary_special_purpose_plane),
    (0x0F0000, 0x10FFFF, PLANE_NAMES.supplementary_private_use_area),
)
# Raw block table: one (first codepoint, last codepoint, block name) row per
# entry, in ascending codepoint order.  Converted into UnicodeRange objects
# near the bottom of this file.
#
# The rows are meant to tile the codepoint space without overlaps or holes,
# so that every codepoint from 0x0 to 0x10fffd maps to exactly one row.
# Ranges with no block name are labelled BLOCK_NAMES.undefined or
# BLOCK_NAMES.unused.
_unicode_blocks = (
    (0x0, 0x7f, BLOCK_NAMES.basic_latin),
    (0x80, 0xff, BLOCK_NAMES.c1_controls_and_latin_1_supplement),
    (0x100, 0x17f, BLOCK_NAMES.latin_extended_a),
    (0x180, 0x24f, BLOCK_NAMES.latin_extended_b),
    (0x250, 0x2af, BLOCK_NAMES.ipa_extensions),
    (0x2b0, 0x2ff, BLOCK_NAMES.spacing_modifier_letters),
    (0x300, 0x36f, BLOCK_NAMES.combining_diacritical_marks),
    (0x370, 0x3ff, BLOCK_NAMES.greek_coptic),
    (0x400, 0x4ff, BLOCK_NAMES.cyrillic),
    (0x500, 0x52f, BLOCK_NAMES.cyrillic_supplement),
    (0x530, 0x58f, BLOCK_NAMES.armenian),
    (0x590, 0x5ff, BLOCK_NAMES.hebrew),
    (0x600, 0x6ff, BLOCK_NAMES.arabic),
    (0x700, 0x74f, BLOCK_NAMES.syriac),
    (0x750, 0x77f, BLOCK_NAMES.undefined),
    (0x780, 0x7bf, BLOCK_NAMES.thaana),
    (0x7c0, 0x8ff, BLOCK_NAMES.undefined),
    (0x900, 0x97f, BLOCK_NAMES.devanagari),
    (0x980, 0x9ff, BLOCK_NAMES.bengali_assamese),
    (0xa00, 0xa7f, BLOCK_NAMES.gurmukhi),
    (0xa80, 0xaff, BLOCK_NAMES.gujarati),
    (0xb00, 0xb7f, BLOCK_NAMES.oriya),
    (0xb80, 0xbff, BLOCK_NAMES.tamil),
    (0xc00, 0xc7f, BLOCK_NAMES.telugu),
    (0xc80, 0xcff, BLOCK_NAMES.kannada),
    # Malayalam ends at 0xd7f; it previously ran to 0xdff and swallowed the
    # whole Sinhala block that follows.
    (0xd00, 0xd7f, BLOCK_NAMES.malayalam),
    (0xd80, 0xdff, BLOCK_NAMES.sinhala),
    (0xe00, 0xe7f, BLOCK_NAMES.thai),
    (0xe80, 0xeff, BLOCK_NAMES.lao),
    (0xf00, 0xfff, BLOCK_NAMES.tibetan),
    (0x1000, 0x109f, BLOCK_NAMES.myanmar),
    (0x10a0, 0x10ff, BLOCK_NAMES.georgian),
    (0x1100, 0x11ff, BLOCK_NAMES.hangul_jamo),
    (0x1200, 0x137f, BLOCK_NAMES.ethiopic),
    (0x1380, 0x139f, BLOCK_NAMES.undefined),
    (0x13a0, 0x13ff, BLOCK_NAMES.cherokee),
    (0x1400, 0x167f, BLOCK_NAMES.unified_canadian_aboriginal_syllabics),
    (0x1680, 0x169f, BLOCK_NAMES.ogham),
    (0x16a0, 0x16ff, BLOCK_NAMES.runic),
    (0x1700, 0x171f, BLOCK_NAMES.tagalog),
    (0x1720, 0x173f, BLOCK_NAMES.hanunoo),
    (0x1740, 0x175f, BLOCK_NAMES.buhid),
    (0x1760, 0x177f, BLOCK_NAMES.tagbanwa),
    (0x1780, 0x17ff, BLOCK_NAMES.khmer),
    (0x1800, 0x18af, BLOCK_NAMES.mongolian),
    (0x18b0, 0x18ff, BLOCK_NAMES.undefined),
    (0x1900, 0x194f, BLOCK_NAMES.limbu),
    (0x1950, 0x197f, BLOCK_NAMES.tai_le),
    (0x1980, 0x19df, BLOCK_NAMES.undefined),
    (0x19e0, 0x19ff, BLOCK_NAMES.khmer_symbols),
    (0x1a00, 0x1cff, BLOCK_NAMES.undefined),
    (0x1d00, 0x1d7f, BLOCK_NAMES.phonetic_extensions),
    (0x1d80, 0x1dff, BLOCK_NAMES.undefined),
    (0x1e00, 0x1eff, BLOCK_NAMES.latin_extended_additional),
    (0x1f00, 0x1fff, BLOCK_NAMES.greek_extended),
    (0x2000, 0x206f, BLOCK_NAMES.general_punctuation),
    (0x2070, 0x209f, BLOCK_NAMES.superscripts_and_subscripts),
    (0x20a0, 0x20cf, BLOCK_NAMES.currency_symbols),
    (0x20d0, 0x20ff, BLOCK_NAMES.combining_diacritical_marks_for_symbols),
    (0x2100, 0x214f, BLOCK_NAMES.letterlike_symbols),
    (0x2150, 0x218f, BLOCK_NAMES.number_forms),
    (0x2190, 0x21ff, BLOCK_NAMES.arrows),
    (0x2200, 0x22ff, BLOCK_NAMES.mathematical_operators),
    (0x2300, 0x23ff, BLOCK_NAMES.miscellaneous_technical),
    (0x2400, 0x243f, BLOCK_NAMES.control_pictures),
    (0x2440, 0x245f, BLOCK_NAMES.optical_character_recognition),
    (0x2460, 0x24ff, BLOCK_NAMES.enclosed_alphanumerics),
    (0x2500, 0x257f, BLOCK_NAMES.box_drawing),
    (0x2580, 0x259f, BLOCK_NAMES.block_elements),
    (0x25a0, 0x25ff, BLOCK_NAMES.geometric_shapes),
    (0x2600, 0x26ff, BLOCK_NAMES.miscellaneous_symbols),
    (0x2700, 0x27bf, BLOCK_NAMES.dingbats),
    (0x27c0, 0x27ef, BLOCK_NAMES.miscellaneous_mathematical_symbols_a),
    (0x27f0, 0x27ff, BLOCK_NAMES.supplemental_arrows_a),
    (0x2800, 0x28ff, BLOCK_NAMES.braille_patterns),
    (0x2900, 0x297f, BLOCK_NAMES.supplemental_arrows_b),
    (0x2980, 0x29ff, BLOCK_NAMES.miscellaneous_mathematical_symbols_b),
    (0x2a00, 0x2aff, BLOCK_NAMES.supplemental_mathematical_operators),
    (0x2b00, 0x2bff, BLOCK_NAMES.miscellaneous_symbols_and_arrows),
    (0x2c00, 0x2e7f, BLOCK_NAMES.undefined),
    (0x2e80, 0x2eff, BLOCK_NAMES.cjk_radicals_supplement),
    (0x2f00, 0x2fdf, BLOCK_NAMES.kangxi_radicals),
    (0x2fe0, 0x2fef, BLOCK_NAMES.undefined),
    (0x2ff0, 0x2fff, BLOCK_NAMES.ideographic_description_characters),
    (0x3000, 0x303f, BLOCK_NAMES.cjk_symbols_and_punctuation),
    (0x3040, 0x309f, BLOCK_NAMES.hiragana),
    (0x30a0, 0x30ff, BLOCK_NAMES.katakana),
    (0x3100, 0x312f, BLOCK_NAMES.bopomofo),
    (0x3130, 0x318f, BLOCK_NAMES.hangul_compatibility_jamo),
    (0x3190, 0x319f, BLOCK_NAMES.kanbun_kunten),
    (0x31a0, 0x31bf, BLOCK_NAMES.bopomofo_extended),
    (0x31c0, 0x31ef, BLOCK_NAMES.undefined),
    (0x31f0, 0x31ff, BLOCK_NAMES.katakana_phonetic_extensions),
    (0x3200, 0x32ff, BLOCK_NAMES.enclosed_cjk_letters_and_months),
    (0x3300, 0x33ff, BLOCK_NAMES.cjk_compatibility),
    (0x3400, 0x4dbf, BLOCK_NAMES.cjk_unified_ideographs_extension_a),
    (0x4dc0, 0x4dff, BLOCK_NAMES.yijing_hexagram_symbols),
    (0x4e00, 0x9faf, BLOCK_NAMES.cjk_unified_ideographs),
    (0x9fb0, 0x9fff, BLOCK_NAMES.undefined),
    (0xa000, 0xa48f, BLOCK_NAMES.yi_syllables),
    (0xa490, 0xa4cf, BLOCK_NAMES.yi_radicals),
    (0xa4d0, 0xabff, BLOCK_NAMES.undefined),
    (0xac00, 0xd7af, BLOCK_NAMES.hangul_syllables),
    (0xd7b0, 0xd7ff, BLOCK_NAMES.undefined),
    (0xd800, 0xdbff, BLOCK_NAMES.high_surrogate_area),
    (0xdc00, 0xdfff, BLOCK_NAMES.low_surrogate_area),
    (0xe000, 0xf8ff, BLOCK_NAMES.private_use_area),
    (0xf900, 0xfaff, BLOCK_NAMES.cjk_compatibility_ideographs),
    (0xfb00, 0xfb4f, BLOCK_NAMES.alphabetic_presentation_forms),
    (0xfb50, 0xfdff, BLOCK_NAMES.arabic_presentation_forms_a),
    (0xfe00, 0xfe0f, BLOCK_NAMES.variation_selectors),
    (0xfe10, 0xfe1f, BLOCK_NAMES.undefined),
    (0xfe20, 0xfe2f, BLOCK_NAMES.combining_half_marks),
    (0xfe30, 0xfe4f, BLOCK_NAMES.cjk_compatibility_forms),
    (0xfe50, 0xfe6f, BLOCK_NAMES.small_form_variants),
    (0xfe70, 0xfeff, BLOCK_NAMES.arabic_presentation_forms_b),
    (0xff00, 0xffef, BLOCK_NAMES.halfwidth_and_fullwidth_forms),
    (0xfff0, 0xffff, BLOCK_NAMES.specials),
    (0x10000, 0x1007f, BLOCK_NAMES.linear_b_syllabary),
    (0x10080, 0x100ff, BLOCK_NAMES.linear_b_ideograms),
    (0x10100, 0x1013f, BLOCK_NAMES.aegean_numbers),
    (0x10140, 0x102ff, BLOCK_NAMES.undefined),
    (0x10300, 0x1032f, BLOCK_NAMES.old_italic),
    (0x10330, 0x1034f, BLOCK_NAMES.gothic),
    # Filler row: this span was previously missing from the table
    (0x10350, 0x1037f, BLOCK_NAMES.undefined),
    (0x10380, 0x1039f, BLOCK_NAMES.ugaritic),
    # Filler row: this span was previously missing from the table
    (0x103a0, 0x103ff, BLOCK_NAMES.undefined),
    (0x10400, 0x1044f, BLOCK_NAMES.deseret),
    (0x10450, 0x1047f, BLOCK_NAMES.shavian),
    (0x10480, 0x104af, BLOCK_NAMES.osmanya),
    (0x104b0, 0x107ff, BLOCK_NAMES.undefined),
    (0x10800, 0x1083f, BLOCK_NAMES.cypriot_syllabary),
    (0x10840, 0x1cfff, BLOCK_NAMES.undefined),
    (0x1d000, 0x1d0ff, BLOCK_NAMES.byzantine_musical_symbols),
    (0x1d100, 0x1d1ff, BLOCK_NAMES.musical_symbols),
    (0x1d200, 0x1d2ff, BLOCK_NAMES.undefined),
    (0x1d300, 0x1d35f, BLOCK_NAMES.tai_xuan_jing_symbols),
    (0x1d360, 0x1d3ff, BLOCK_NAMES.undefined),
    (0x1d400, 0x1d7ff, BLOCK_NAMES.mathematical_alphanumeric_symbols),
    (0x1d800, 0x1ffff, BLOCK_NAMES.undefined),
    (0x20000, 0x2a6df, BLOCK_NAMES.cjk_unified_ideographs_extension_b),
    (0x2a6e0, 0x2f7ff, BLOCK_NAMES.undefined),
    (0x2f800, 0x2fa1f, BLOCK_NAMES.cjk_compatibility_ideographs_supplement),
    # Starts right after the previous row (was 0x2fab0, which left
    # 0x2fa20-0x2faaf belonging to no row at all).
    (0x2fa20, 0xdffff, BLOCK_NAMES.unused),
    (0xe0000, 0xe007f, BLOCK_NAMES.tags),
    (0xe0080, 0xe00ff, BLOCK_NAMES.unused),
    (0xe0100, 0xe01ef, BLOCK_NAMES.variation_selectors_supplement),
    (0xe01f0, 0xeffff, BLOCK_NAMES.unused),
    (0xf0000, 0xffffd, BLOCK_NAMES.supplementary_private_use_area_a),
    (0xffffe, 0xfffff, BLOCK_NAMES.unused),
    (0x100000, 0x10fffd, BLOCK_NAMES.supplementary_private_use_area_b),
)
class UnicodeRange(object):
    """
    Named, inclusive range of unicode codepoints.

    This can represent a standard Unicode Block, a standard Unicode Plane, or
    even a custom range.

    Attributes:
    name: label for the range (for example a Unicode Block name)
    start: integer value of the first codepoint in the range
    end: integer value of the last codepoint in the range (inclusive)
    """

    def __init__(self, start, end, name):
        """
        @param start: integer value of the first codepoint in the range
        @param end: integer value of the last codepoint in the range;
                    it is included when iterating
        @param name: label for the range
        """
        self.name = name
        self.start = start
        self.end = end

    def __str__(self):
        """Returns '<start in hex> <end in hex> <name>'"""
        return '{0} {1} {2}'.format(
            hex(self.start), hex(self.end), str(self.name))

    def codepoints(self):
        """
        Generator that yields the integer value of all codepoints in
        UnicodeRange, from start up to and including end.
        Yields nothing when end is smaller than start.
        """
        # end is inclusive, hence the + 1
        for codepoint in range(self.start, self.end + 1):
            yield codepoint

    def codepoint_names(self):
        """
        Generator that yields, for every codepoint in UnicodeRange, the
        value returned by this module's codepoint_name() function.
        """
        for codepoint in self.codepoints():
            yield codepoint_name(codepoint)

    def encoded_codepoints(self, encoding='utf-8'):
        """
        Generator that yields an <encoding> encoded string representation
        of all codepoints in UnicodeRange.

        @param encoding: name of the codec handed to encode()

        NOTE(review): on Python 3 strict codecs such as UTF-8 refuse lone
        surrogates (0xD800-0xDFFF), so a range containing them raises
        UnicodeEncodeError part-way through - confirm desired handling.
        """
        # unichr only exists on Python 2; Python 3 folded it into chr.
        # Resolve it here so the method works on both.
        try:
            to_character = unichr
        except NameError:
            to_character = chr
        for codepoint in self.codepoints():
            yield to_character(codepoint).encode(encoding)
class UnicodeRangeList(list):
    """
    List-like collection of UnicodeRange objects.

    Represents a set of Unicode codepoint ranges, such as a custom non-linear
    set of ranges, or a Unicode Plane.  The ranges are visited in list
    order; nothing here sorts or de-duplicates them.

    @TODO: Override constructor so that only UnicodeRange objects can be
           appended or extended?
    """

    def __str__(self):
        """
        Returns every contained range rendered as '<...>', comma separated
        and wrapped in square brackets, e.g. '[<0x0 0x7f Basic Latin>]'.
        An empty list renders as '[]'.
        """
        # join() puts separators only between items, so there is no
        # dangling ', ' before the closing bracket.
        rendered = ', '.join(
            '<{0}>'.format(str(unicode_range)) for unicode_range in self)
        return '[{0}]'.format(rendered)

    def codepoints(self):
        """
        Generator that yields the integer value of all codepoints in all
        UnicodeRange objects in UnicodeRangeList
        """
        for unicode_range in self:
            for codepoint in unicode_range.codepoints():
                yield codepoint

    def codepoint_names(self):
        """
        Generator that yields, for every codepoint in all ranges in
        UnicodeRangeList, the value returned by this module's
        codepoint_name() function.
        """
        for codepoint in self.codepoints():
            yield codepoint_name(codepoint)

    def encoded_codepoints(self, encoding='utf-8'):
        """
        Generator that yields an <encoding> encoded string representation
        of all codepoints in all UnicodeRange objects in UnicodeRangeList

        @param encoding: name of the codec handed to encode()

        NOTE(review): on Python 3 strict codecs such as UTF-8 refuse lone
        surrogates (0xD800-0xDFFF), so a list containing them raises
        UnicodeEncodeError part-way through - confirm desired handling.
        """
        # unichr only exists on Python 2; Python 3 folded it into chr.
        # Resolve it here so the method works on both.
        try:
            to_character = unichr
        except NameError:
            to_character = chr
        for codepoint in self.codepoints():
            yield to_character(codepoint).encode(encoding)

    def get_range(self, range_name):
        """
        Expects a range name (e.g. a Unicode Block name) as a string.

        Returns the first UnicodeRange object in the UnicodeRangeList whose
        name equals range_name, or None if there is no such range.
        """
        for unicode_range in self:
            if unicode_range.name == range_name:
                return unicode_range
        return None

    def get_range_list(self, range_name_list):
        """
        Expects a list of range names (e.g. Unicode Block names) as strings.

        Returns a new UnicodeRangeList holding, in their existing order,
        every UnicodeRange object whose name is in range_name_list.  The
        result is an empty UnicodeRangeList when nothing matches.
        """
        range_list = UnicodeRangeList()
        for unicode_range in self:
            if unicode_range.name in range_name_list:
                range_list.append(unicode_range)
        return range_list
# Build the UNICODE_BLOCKS 'constant': one UnicodeRange per block table row
UNICODE_BLOCKS = UnicodeRangeList()
for _start, _end, _name in _unicode_blocks:
    UNICODE_BLOCKS.append(UnicodeRange(start=_start, end=_end, name=_name))

# Build the UNICODE_PLANES 'constant': one UnicodeRange per plane table row
UNICODE_PLANES = UnicodeRangeList()
for _start, _end, _name in _unicode_planes:
    UNICODE_PLANES.append(UnicodeRange(start=_start, end=_end, name=_name))
def codepoint_parent_plane(codepoint_integer):
    """
    Expects a Unicode codepoint as an integer.

    Returns the first entry of UNICODE_PLANES whose start..end span
    (both ends included) contains codepoint_integer, or None when no
    entry contains it.
    """
    matching_planes = (
        candidate for candidate in UNICODE_PLANES
        if candidate.start <= codepoint_integer <= candidate.end)
    return next(matching_planes, None)
def codepoint_parent_block(codepoint_integer):
    """
    Expects a Unicode codepoint as an integer.

    Returns the first entry of UNICODE_BLOCKS whose start..end span
    (both ends included) contains codepoint_integer, or None when no
    entry contains it.
    """
    matching_blocks = (
        candidate for candidate in UNICODE_BLOCKS
        if candidate.start <= codepoint_integer <= candidate.end)
    return next(matching_blocks, None)
def codepoint_name(codepoint_integer):
    """
    Expects a Unicode codepoint as an integer.

    Returns None when codepoint_integer lies outside
    UNICODE_STARTING_CODEPOINT..UNICODE_ENDING_CODEPOINT (both included).
    Otherwise returns the codepoint's unicode name, or - if unicodedata
    has no name for it - the codepoint's integer value in hexadecimal
    format as a string (e.g. '0x378').
    """
    # The upper bound is compared against the constant itself; the previous
    # '+ 1' let one codepoint past the declared end through.
    in_range = (UNICODE_STARTING_CODEPOINT
                <= codepoint_integer
                <= UNICODE_ENDING_CODEPOINT)
    if not in_range:
        return None

    # unichr only exists on Python 2; Python 3 folded it into chr.
    try:
        to_character = unichr
    except NameError:
        to_character = chr

    # The second argument is the fallback unicodedata.name() returns for
    # characters that have no name.
    return unicodedata.name(
        to_character(codepoint_integer), hex(codepoint_integer))