From 2bed7c58e5ef24860ad89242d7ea49c32959efa9 Mon Sep 17 00:00:00 2001 From: Alexander Zarubkin Date: Mon, 27 May 2024 13:35:09 +0300 Subject: [PATCH 1/3] Add Russian name official order handling which is enabled by additional configuration parameter that is False by default. Signed-off-by: Alexander Zarubkin --- nameparser/config/__init__.py | 39 ++++++++++------- nameparser/config/regexes.py | 10 ++++- nameparser/parser.py | 74 +++++++++++++++++++++++++++++++- tests.py | 81 +++++++++++++++++++++++++++++++++++ 4 files changed, 185 insertions(+), 19 deletions(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 7b2baef..01eb38f 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ The :py:mod:`nameparser.config` module manages the configuration of the -nameparser. +nameparser. A module-level instance of :py:class:`~nameparser.config.Constants` is created and used by default for all HumanName instances. You can adjust the entire module's @@ -25,7 +25,7 @@ >>> hn.parse_full_name() # need to run this again after config changes **Potential Gotcha**: If you do not pass ``None`` as the second argument, -``hn.C`` will be a reference to the module config, possibly yielding +``hn.C`` will be a reference to the module config, possibly yielding unexpected results. See `Customizing the Parser `_. """ from __future__ import unicode_literals @@ -57,7 +57,7 @@ class SetManager(Set): Only special functionality beyond that provided by set() is to normalize constants for comparison (lower case, no periods) - when they are add()ed and remove()d and allow passing multiple + when they are add()ed and remove()d and allow passing multiple string arguments to the :py:func:`add()` and :py:func:`remove()` methods. ''' @@ -125,7 +125,7 @@ def remove(self, *strings): class TupleManager(dict): ''' - A dictionary with dot.notation access. Subclass of ``dict``. 
Makes the tuple constants + A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants more friendly. ''' @@ -148,23 +148,23 @@ class Constants(object): """ An instance of this class hold all of the configuration constants for the parser. - :param set prefixes: + :param set prefixes: :py:attr:`prefixes` wrapped with :py:class:`SetManager`. - :param set titles: + :param set titles: :py:attr:`titles` wrapped with :py:class:`SetManager`. - :param set first_name_titles: + :param set first_name_titles: :py:attr:`~titles.FIRST_NAME_TITLES` wrapped with :py:class:`SetManager`. - :param set suffix_acronyms: + :param set suffix_acronyms: :py:attr:`~suffixes.SUFFIX_ACRONYMS` wrapped with :py:class:`SetManager`. - :param set suffix_not_acronyms: + :param set suffix_not_acronyms: :py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS` wrapped with :py:class:`SetManager`. - :param set conjunctions: + :param set conjunctions: :py:attr:`conjunctions` wrapped with :py:class:`SetManager`. :type capitalization_exceptions: tuple or dict - :param capitalization_exceptions: + :param capitalization_exceptions: :py:attr:`~capitalization.CAPITALIZATION_EXCEPTIONS` wrapped with :py:class:`TupleManager`. :type regexes: tuple or dict - :param regexes: + :param regexes: :py:attr:`regexes` wrapped with :py:class:`TupleManager`. """ @@ -187,9 +187,9 @@ class Constants(object): empty_attribute_default = '' """ Default return value for empty attributes. - + .. 
doctest:: - + >>> from nameparser.config import CONSTANTS >>> CONSTANTS.empty_attribute_default = None >>> name = HumanName("John Doe") @@ -197,7 +197,7 @@ class Constants(object): None >>>name.first 'John' - + """ capitalize_name = False @@ -231,6 +231,11 @@ class Constants(object): """ + try_russian_name_specifics = False + """ + If set, the parser will attempt to parse names in the Russian order (Last First Middle) + """ + def __init__(self, prefixes=PREFIXES, suffix_acronyms=SUFFIX_ACRONYMS, @@ -239,7 +244,8 @@ def __init__(self, first_name_titles=FIRST_NAME_TITLES, conjunctions=CONJUNCTIONS, capitalization_exceptions=CAPITALIZATION_EXCEPTIONS, - regexes=REGEXES + regexes=REGEXES, + try_russian_name_specifics=False, ): self.prefixes = SetManager(prefixes) self.suffix_acronyms = SetManager(suffix_acronyms) @@ -249,6 +255,7 @@ def __init__(self, self.conjunctions = SetManager(conjunctions) self.capitalization_exceptions = TupleManager(capitalization_exceptions) self.regexes = TupleManager(regexes) + self.try_russian_name_specifics = try_russian_name_specifics self._pst = None @property diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index bd4b320..be48229 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -8,14 +8,14 @@ re_emoji = re.compile('[' '\U0001F300-\U0001F64F' '\U0001F680-\U0001F6FF' - '\u2600-\u26FF\u2700-\u27BF]+', + '\u2600-\u26FF\u2700-\u27BF]+', re.UNICODE) except re.error: # Narrow UCS-2 build re_emoji = re.compile('(' '\ud83c[\udf00-\udfff]|' '\ud83d[\udc00-\ude4f\ude80-\udeff]|' - '[\u2600-\u26FF\u2700-\u27BF])+', + '[\u2600-\u26FF\u2700-\u27BF])+', re.UNICODE) REGEXES = set([ @@ -31,6 +31,12 @@ ("period_not_at_end",re.compile(r'.*\..+$', re.I | re.U)), ("emoji",re_emoji), ("phd", re.compile(r'\s(ph\.?\s+d\.?)', re.I | re.U)), + ("russian_last_name_endings", re.compile(r'^.+(ov|ova|ev|eva|yov|yova|in|yn|ina|sky|skaya|ich|ych|uk|yuk|yk|ko|ak|ukh|ykh|ikh|chuk|yy|yi|oy|oi|iy|ii)$', re.I | re.U)), 
+ ("russian_last_name_endings_cyrillic", re.compile(r'^.+(ов|ова|ев|ева|ёв|ёва|ин|ын|ина|ский|ская|цкая|цкий|ич|ыч|ук|юк|ык|ко|ак|ух|ых|их|чук|ый|ой|ий)$', re.I | re.U)), + ("russian_patronymic_endings", re.compile(r'^(.+(ovich|ovna|evich|evna|ichna))|(ilyich|kuzmich|lukich|fomich)$', re.I | re.U)), + ("russian_patronymic_endings_cyrillic", re.compile(r'^(.+(ович|овна|евич|евна|ична))|(ильич|кузьмич|лукич|фомич)$', re.I | re.U)), + ("turkic_patronymic_suffixes", re.compile(r'^(oglu|ogly|qizi|kizi|kyzy|gyzy|uly|uulu)$', re.I | re.U)), + ("turkic_patronymic_suffixes_cyrillic", re.compile(r'^(оглу|оглы|кызы|гызы|улы|уулу)$', re.I | re.U)), ]) """ All regular expressions used by the parser are precompiled and stored in the config. diff --git a/nameparser/parser.py b/nameparser/parser.py index a5eb352..c360847 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -512,6 +512,8 @@ def post_process(self): and :py:func:`handle_capitalization`. """ self.handle_firstnames() + if self.C.try_russian_name_specifics: + self.handle_russian_name_specifics() self.handle_capitalization() def fix_phd(self): @@ -568,6 +570,76 @@ def handle_firstnames(self): and not lc(self.title) in self.C.first_name_titles: self.last, self.first = self.first, self.last + def is_turkic_patronymic(self, piece): + return self.C.regexes.turkic_patronymic_suffixes.match(piece) or self.C.regexes.turkic_patronymic_suffixes_cyrillic.match(piece) + + def handle_russian_name_specifics(self): + # Russian name order may have a last name first, + # so the order will be Last First Middle instead of First Middle Last (but without comma!) 
+ # We can deduce this by checking EITHER if the first name looks like a russian last name, + # (but it currently breaks on names like Martin or Franklin or Benjamin - hence extra config parameter) + # OR if the last name looks like a russian patronymic + # (but it will break on name without patronymic and foreign last name like Olurombi Alexey <- Last First order), + # Another case: Last First instead of First Last. Then middle is empty. + is_name_order_lfm = self.is_russian_last_name(self.first) or ( + # if the middle name also looks like a russian patronymic, then it's a First Middle Last order, + # e.g. Roman Alexeevich Abramovich <- Abramovich does look like patronymic, but it's really a last name + self.is_russian_patronymic(self.last) and not self.is_russian_patronymic(self.middle) + ) or ( # some Russian citizens have patronymics of turkic origin, e.g. Said Ogly + self.is_turkic_patronymic(self.last) + ) + + # rare case: last name consists of two or more words separated by space + # one of them got incorrectly parsed as first/middle name, + # Russian middle names are patronymics, and consist of one word only + if len(self.middle_list) > 1: + # exception to this rule: turkic origin patronymics (e.g. Said Ogly <- two pieces!) 
+ if is_name_order_lfm: + if self.is_turkic_patronymic(self.last): + # e.g. "Ahmedov Oktay Said Ogly" <- "Said" is moved next to "Ogly" (both end up in the middle name after the rotation below) + self.last_list = self.middle_list[1:] + self.last_list + self.middle_list = [self.middle_list[0]] + else: + # then the second word gets parsed as middle name (if the last name goes first in the user input) + # take all elements of middle_list except the last one and append them to first_list + # (it will be rotated to last_list) + self.first_list += self.middle_list[:-1] + # the last element of middle_list is the new middle name (will be rotated to first_list) + self.middle_list = [self.middle_list[-1]] + else: + if self.is_turkic_patronymic(self.middle_list[-1]): + pass # no specific treatment needed + else: + # if the last name goes last in the user input, then all parts except the last get parsed as middle name + # keep only the first middle piece and move the rest to the front of the last name + self.last_list = self.middle_list[1:] + self.last_list + self.middle_list = [self.middle_list[0]] + + if is_name_order_lfm: + # the input was in Last First [Middle] order, so every component was parsed one slot off + if self.middle: + # rotate the name components + self.first, self.middle, self.last = self.middle, self.last, self.first + else: + self.first, self.last = self.last, self.first + + def is_russian_last_name(self, piece): + """ + Return True if the piece ends in a Slavic last-name suffix and is not one of the known first names that share such a suffix. 
+ """ + # some first names match these regexes, so we check them first + if piece.lower() in ['lev', 'eva', 'yacov', 'yakov', 'veniamin', + 'lyubov', 'lubov', 'nina', + 'лев', 'ева', 'яков', 'вениамин', + 'нина']: + return False + if self.C.regexes.russian_last_name_endings.match(piece) or self.C.regexes.russian_last_name_endings_cyrillic.match(piece): + return True + return False + + def is_russian_patronymic(self, piece): + return self.C.regexes.russian_patronymic_endings.match(piece) or self.C.regexes.russian_patronymic_endings_cyrillic.match(piece) + def parse_full_name(self): """ @@ -764,7 +836,7 @@ def parse_pieces(self, parts, additional_parts_count=0): titles = list(filter(self.is_title, period_chunks)) suffixes = list(filter(self.is_suffix, period_chunks)) - # add the part to the constant so it will be found + # add the part to the constant so it will be found if len(list(titles)): self.C.titles.add(part) continue diff --git a/tests.py b/tests.py index 2cdd526..06c66db 100644 --- a/tests.py +++ b/tests.py @@ -2387,6 +2387,87 @@ def test_constructor_multiple(self): self.m(hn.title, "mytitle", hn) +class RussianNameOrderTestCase(HumanNameTestBase): + C = Constants(try_russian_name_specifics=True) + + def test_russian_name_specific_order(self): + hn = HumanName("Zarubkin Alexander Sergeevich", constants=self.C) + self.m(hn.first, "Alexander", hn) + self.m(hn.middle, "Sergeevich", hn) + self.m(hn.last, "Zarubkin", hn) + + def test_specific_order_without_patronymic(self): + hn = HumanName("Zarubkin Alexander", constants=self.C) + self.m(hn.first, "Alexander", hn) + self.m(hn.last, "Zarubkin", hn) + + def test_last_name_with_dash_specific_order(self): + hn = HumanName("Blokin-Mechtalin Konstantin Yurievich", constants=self.C) + self.m(hn.first, "Konstantin", hn) + self.m(hn.middle, "Yurievich", hn) + self.m(hn.last, "Blokin-Mechtalin", hn) + + def test_russian_name_with_african_origin(self): + hn = HumanName("Alexey Richardovich Olurombi Akinwale", 
constants=self.C) + self.m(hn.first, "Alexey", hn) + self.m(hn.middle, "Richardovich", hn) + self.m(hn.last, "Olurombi Akinwale", hn) + + def test_russian_name_specific_order_with_african_origin(self): + hn = HumanName("Olurombi Akinwale Alexey Richardovich", constants=self.C) + self.m(hn.first, "Alexey", hn) + self.m(hn.middle, "Richardovich", hn) + self.m(hn.last, "Olurombi Akinwale", hn) + + def test_last_name_like_russian_patronymic(self): + hn = HumanName("Sergey Vitalyevich Petsevich", constants=self.C) + self.m(hn.first, "Sergey", hn) + self.m(hn.middle, "Vitalyevich", hn) + self.m(hn.last, "Petsevich", hn) + + def test_last_name_like_russian_patronymic_specific_order(self): + hn = HumanName("Petsevich Sergey Vitalyevich", constants=self.C) + self.m(hn.first, "Sergey", hn) + self.m(hn.middle, "Vitalyevich", hn) + self.m(hn.last, "Petsevich", hn) + + def test_turkic_patronymic(self): + hn = HumanName("Leyla Said Gyzy Ahmedova", constants=self.C) + self.m(hn.first, "Leyla", hn) + self.m(hn.middle, "Said Gyzy", hn) + self.m(hn.last, "Ahmedova", hn) + + def test_turkic_patronymic_specific_order(self): + hn = HumanName("Ahmedova Leyla Said Gyzy", constants=self.C) + self.m(hn.first, "Leyla", hn) + self.m(hn.middle, "Said Gyzy", hn) + self.m(hn.last, "Ahmedova", hn) + + # these surnames end with -y (-ый/-ий in Russian) which I would rather not add to the Russian last names endings list + # as the resulting regex would be too broad + # However, if the first name is followed by patronymic, it will be caught and parsed properly + # If it is transliterated as -yi/-yy/-iy/-ii instead of -y, it will also be recognized properly + # It's a shame the usual transliteration of -ый/-ий to English is -y (e.g. Sikorsky) + # I guess it follows the rules for similar last names in Polish language. + # Most popular endings for -y: -ский/-цкий (-sky/-tsky) are already covered, but corner cases like this one remain. 
+ @unittest.expectedFailure + def test_tricky_case1(self): + hn = HumanName("Mogilny Alexander", constants=self.C) # famous hockey player + self.m(hn.first, "Alexander", hn) + self.m(hn.last, "Mogilny", hn) + + def test_tricky_case2(self): + hn = HumanName("Mogilny Alexander Gennadyevich", constants=self.C) # famous hockey player + self.m(hn.first, "Alexander", hn) + self.m(hn.middle, "Gennadyevich", hn) + self.m(hn.last, "Mogilny", hn) + + def test_tricky_case3(self): + hn = HumanName("Mogilnyy Alexander", constants=self.C) # famous hockey player + self.m(hn.first, "Alexander", hn) + self.m(hn.last, "Mogilnyy", hn) + + TEST_NAMES = ( "John Doe", "John Doe, Jr.", From 58a09502508e87655b02b128610877680accb198 Mon Sep 17 00:00:00 2001 From: Alexander Zarubkin Date: Mon, 27 May 2024 13:41:24 +0300 Subject: [PATCH 2/3] Add some checks against name components being None. Signed-off-by: Alexander Zarubkin --- nameparser/parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index c360847..ab50e61 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -581,12 +581,12 @@ def handle_russian_name_specifics(self): # OR if the last name looks like a russian patronymic # (but it will break on name without patronymic and foreign last name like Olurombi Alexey <- Last First order), # Another case: Last First instead of First Last. Then middle is empty. - is_name_order_lfm = self.is_russian_last_name(self.first) or ( + is_name_order_lfm = (self.first and self.is_russian_last_name(self.first)) or ( # if the middle name also looks like a russian patronymic, then it's a First Middle Last order, # e.g. 
Roman Alexeevich Abramovich <- Abramovich does look like patronymic, but it's really a last name - self.is_russian_patronymic(self.last) and not self.is_russian_patronymic(self.middle) + self.last and self.is_russian_patronymic(self.last) and not (self.middle and self.is_russian_patronymic(self.middle)) # middle may be None or empty, so guard it like first/last ) or ( # some Russian citizens have patronymics of turkic origin, e.g. Said Ogly - self.is_turkic_patronymic(self.last) + self.last and self.is_turkic_patronymic(self.last) ) # rare case: last name consists of two or more words separated by space From b403d9adc6947ef976a73f1c56fbec6a2ee9aa50 Mon Sep 17 00:00:00 2001 From: Alexander Zarubkin Date: Tue, 28 May 2024 09:40:12 +0300 Subject: [PATCH 3/3] Add Fokich patronymic. Signed-off-by: Alexander Zarubkin --- nameparser/config/regexes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index be48229..4ba7155 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -33,8 +33,8 @@ ("phd", re.compile(r'\s(ph\.?\s+d\.?)', re.I | re.U)), ("russian_last_name_endings", re.compile(r'^.+(ov|ova|ev|eva|yov|yova|in|yn|ina|sky|skaya|ich|ych|uk|yuk|yk|ko|ak|ukh|ykh|ikh|chuk|yy|yi|oy|oi|iy|ii)$', re.I | re.U)), ("russian_last_name_endings_cyrillic", re.compile(r'^.+(ов|ова|ев|ева|ёв|ёва|ин|ын|ина|ский|ская|цкая|цкий|ич|ыч|ук|юк|ык|ко|ак|ух|ых|их|чук|ый|ой|ий)$', re.I | re.U)), - ("russian_patronymic_endings", re.compile(r'^(.+(ovich|ovna|evich|evna|ichna))|(ilyich|kuzmich|lukich|fomich)$', re.I | re.U)), - ("russian_patronymic_endings_cyrillic", re.compile(r'^(.+(ович|овна|евич|евна|ична))|(ильич|кузьмич|лукич|фомич)$', re.I | re.U)), + ("russian_patronymic_endings", re.compile(r'^(.+(ovich|ovna|evich|evna|ichna)|ilyich|kuzmich|lukich|fomich|fokich)$', re.I | re.U)), # single outer group so ^ and $ anchor every alternative + ("russian_patronymic_endings_cyrillic", re.compile(r'^(.+(ович|овна|евич|евна|ична)|ильич|кузьмич|лукич|фомич|фокич)$', re.I | re.U)), ("turkic_patronymic_suffixes", 
re.compile(r'^(oglu|ogly|qizi|kizi|kyzy|gyzy|uly|uulu)$', re.I | re.U)), ("turkic_patronymic_suffixes_cyrillic", re.compile(r'^(оглу|оглы|кызы|гызы|улы|уулу)$', re.I | re.U)), ])