#!/usr/bin/env python3 # Johns Hopkins University (authors: Amir Hussein) """ Compute WER per language """ import sys, codecs, math, pickle, unicodedata, re from collections import Counter import argparse from kaldialign import align from collections import defaultdict def get_parser(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument( "--rec", type=str, default="", help="Cut ref file", ) return parser lids = "en,zh" lids_dict = {lid:id+1 for id, lid in enumerate(lids.split(","))} id2lang = {id+1: lid for id, lid in enumerate(lids.split(","))} bad_id = [] def extract_info(line, info): # Split the line at the first colon to separate the ID id_part, rest = line.split(':', 1) # Extract 'ref' by finding its start and end ref_start = rest.find(info) ref_end = rest.find(']', ref_start) ref = rest[ref_start+len(info):ref_end].replace("'", "").split(', ') # Extract 'lid' if 'lid=' in rest: lid_start = rest.find('lid=[') lid_end = rest.find(']', lid_start) lid = rest[lid_start+len('lid=['):lid_end].split(', ') else: lid = [''] if lid[0]=='': bad_id.append(id_part) if " ".join(lid): lid = [int(i) for i in lid] # Convert each element to integer return id_part.strip(), ref, lid def is_English(c): """check character is in English""" return ord(c.lower()) >= ord("a") and ord(c.lower()) <= ord("z") def get_en(text): res = [] for w in text: if w: if is_English(w[0]): res.append(w) else: continue return res def get_zh(text): res = [] for w in text: if w: if is_English(w[0]): continue else: res.append(w) return res def extract_info_lid(line, tag): # Split the line at the first colon to separate the ID id_part, rest = line.split(':', 1) # Extract 'ref' by finding its start and end ref_start = rest.find(tag) ref_end = rest.find(']', ref_start) ref = rest[ref_start+len(tag):ref_end].replace("'", "").split(', ') return id_part.strip(), ref def align_lid2(labels_a, labels_b, a, b): # Alignment EPS = '*' ali = align(a, b, EPS, sclite_mode=True) a2idx = {(i,idx):j for idx,(i,j) in enumerate(zip(a,labels_a))} b2idx = {(i,idx):j for idx,(i,j) in enumerate(zip(b,labels_b))} # Comparing labels of aligned elements idx_a = 0 idx_b = 0 ali_idx=0 aligned_a = [] aligned_b = [] while idx_a