Skip to content

Code 2 #8

@stestagg

Description

@stestagg
import json
import operator
from collections import defaultdict

from detect import get_top_bigrams, parse_sample


def check_common_bigrams(bigrams, bigram_data):
    """Count, per language, how many of *bigrams* occur in its bigram data.

    Args:
        bigrams: Iterable of bigrams to look up. Each entry is counted
            separately, so repeated entries contribute more than once.
        bigram_data: Mapping of language -> container of that language's
            bigrams (anything that supports the ``in`` operator).

    Returns:
        A dict mapping every language in *bigram_data* to the number of
        entries of *bigrams* found in that language's container.
    """
    return {
        language: sum(1 for candidate in bigrams if candidate in known)
        for language, known in bigram_data.items()
    }


def main(sample):
    """Score *sample* against the trained bigram data and print the best match.

    Bigram counts for the sample are collected with ``parse_sample`` and
    reduced with ``get_top_bigrams``; the result is compared against the
    data loaded from ``output.json`` (presumably a mapping of language ->
    bigrams produced by a training step -- confirm against the trainer).
    The ``(language, match_count)`` pair with the highest count is printed,
    followed by ``repr(sample)``.

    Args:
        sample: The text to classify.

    Raises:
        ValueError: If ``output.json`` holds an empty mapping, so there is
            no language to pick.
    """
    counts = defaultdict(int)
    parse_sample(counts, text=sample)
    bigrams = get_top_bigrams(counts)

    # Use a context manager so the handle is closed on every call, and pin
    # the encoding so decoding does not depend on the platform locale.
    with open('output.json', encoding='utf-8') as trained_file:
        trained = json.load(trained_file)

    results = check_common_bigrams(bigrams, trained)
    # max() returns the first entry with the highest count, which is the
    # same entry a stable descending sort would place at index 0.
    best = max(results.items(), key=operator.itemgetter(1))
    print(best, repr(sample))


if __name__ == "__main__":
    # test_100.json is read as JSON Lines: one JSON object per line, each
    # carrying the sample under its 'text' key. The with-block closes the
    # file once every line has been processed, and the explicit encoding
    # keeps decoding independent of the platform locale.
    with open("test_100.json", encoding="utf-8") as samples:
        for line in samples:
            main(json.loads(line)['text'])

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions