lingua-py语种检测入门指南

安装

1
pip install lingua-language-detector  # 2.1.1 (presumably the version installed when this guide was written)

简单使用

1
2
3
4
5
6
7
8
9
10
11
12
13
from lingua import Language, LanguageDetectorBuilder

# Restrict detection to ten candidate languages: Chinese, English, Indonesian,
# Malay, Tagalog (Philippines), Hindi, Thai, Vietnamese, Arabic and Turkish.
languages = [
    Language.CHINESE,
    Language.ENGLISH,
    Language.INDONESIAN,
    Language.MALAY,
    Language.TAGALOG,
    Language.HINDI,
    Language.THAI,
    Language.VIETNAMESE,
    Language.ARABIC,
    Language.TURKISH,
]
# Configure the builder with the candidate set, then build the detector.
builder = LanguageDetectorBuilder.from_languages(*languages)
detector = builder.build()

# Detect the language of a short English sentence.
language = detector.detect_language_of("languages are awesome")
language  # Language.ENGLISH

# ISO 639-1 (two-letter) code of the detected language, and its plain name.
language.iso_code_639_1  # IsoCode639_1.EN
language.iso_code_639_1.name  # EN
# ISO 639-3 (three-letter) code of the detected language, and its plain name.
language.iso_code_639_3  # IsoCode639_3.ENG
language.iso_code_639_3.name  # ENG
  • 各语种检测示例

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    # Sample detections for the configured languages. The trailing comment on
    # each line is the value the detector returned for that input.
    detector.detect_language_of('语言的魅力')  # expected: Language.CHINESE — TODO confirm
    detector.detect_language_of("languages are awesome")  # Language.ENGLISH

    # Latin-script Southeast Asian languages.
    detector.detect_language_of('kontol maaf tai ga nanya') # Language.INDONESIAN
    detector.detect_language_of('sahu randi aao na') # Language.MALAY
    detector.detect_language_of('Magandang umaga') # Language.TAGALOG

    # Hindi, Thai and Vietnamese samples.
    detector.detect_language_of('कयों') # Language.HINDI
    detector.detect_language_of('ยินดีที่ได้รู้จัก') # Language.THAI
    detector.detect_language_of('Ông từ đâu đến?') # Language.VIETNAMESE

    # Arabic and Turkish samples.
    detector.detect_language_of('سّيد') # Language.ARABIC
    detector.detect_language_of('İyi akşamlar.') # Language.TURKISH

    # Input made only of emoji and punctuation: the detector returns None.
    detector.detect_language_of('😭🧖🧖..[];') # None
  • 检测错误的示例(bad cases)

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    # A misclassified input: the detector returns Language.ENGLISH, but the
    # text is actually Indonesian (Language.INDONESIAN).
    detector.detect_language_of('cwek sangean pap')


    # Print the confidence distribution for the same input, limited to the
    # first three entries of the returned list.
    confidence_values = detector.compute_language_confidence_values('cwek sangean pap')
    for confidence in confidence_values[:3]:
        # The loop body must be indented under the `for` header; without the
        # indentation the snippet raises IndentationError.
        print(f"{confidence.language.name}: {confidence.value:.2f}")
    # ENGLISH: 0.31
    # INDONESIAN: 0.29
    # TAGALOG: 0.23


    # Ask for the confidence value of one specific language only.
    detector.compute_language_confidence("cwek sangean pap", Language.INDONESIAN) # 0.290841954706363

参考资料