
PHPer, Golang, AWS エンジニアの日々

GCPのCloud Vision APIを利用してOCRを使い、画像の位置から文字列を取得

Cloud Vision APIのOCR機能を使ってみました。日本語でもかなり精度が高く、驚いています。




import io
from google.cloud import vision
import xml.etree.ElementTree as ET

def read_ocr(input_file):
    """Run Cloud Vision document text detection on an image file.

    Args:
        input_file: Path of the image file; it is opened in binary mode and
            its full contents are sent to the API.

    Returns:
        The response object returned by
        ``ImageAnnotatorClient.document_text_detection``.
    """
    annotator = vision.ImageAnnotatorClient()
    with io.open(input_file, 'rb') as fh:
        raw_bytes = fh.read()
    return annotator.document_text_detection(
        image=vision.Image(content=raw_bytes)
    )

def card_template(input_xml):
    """Parse an XML template and return its root element.

    Args:
        input_xml: Path of the XML file to parse.

    Returns:
        The root ``Element`` of the parsed document.
    """
    return ET.parse(input_xml).getroot()

def card_analytics(response, card_template, height):
    """Collect the OCR text that falls inside each region of a card template.

    Every recognised symbol in ``response.full_text_annotation`` is reduced
    to the centre point of its bounding box. For each ``<object>`` element of
    the template, the symbols whose centre lies inside the object's
    ``<bndbox>`` (bounds inclusive) are concatenated in the order the OCR
    response lists them.

    Args:
        response: OCR response exposing ``full_text_annotation`` with the
            ``pages -> blocks -> paragraphs -> words -> symbols`` hierarchy.
        card_template: Root XML element containing ``size/height`` and any
            number of ``object`` elements, each with ``name`` and ``bndbox``
            (``xmin``, ``ymin``, ``xmax``, ``ymax``) children.
        height: Target height used to rescale the template. The template's
            vertical bounds are multiplied by ``height / size.height``; when
            either value is not positive, no scaling is applied.

    Returns:
        dict mapping each object name to the text found inside its region
        (an empty string when no symbol falls inside).
    """
    size = card_template.find("./size")
    org_height = int(size.find('height').text)
    ratio = 1.0
    if org_height > 0 and height > 0:
        ratio = height / org_height

    # Gather (text, x-centre, y-centre) for every recognised symbol.
    # Vertices 0 and 2 are treated as opposite corners of the bounding box.
    text_infos = []
    document = response.full_text_annotation
    for page in document.pages:
        for block in page.blocks:
            for paragraph in block.paragraphs:
                for word in paragraph.words:
                    for symbol in word.symbols:
                        vertices = symbol.bounding_box.vertices
                        xcenter = (vertices[0].x + vertices[2].x) / 2
                        ycenter = (vertices[0].y + vertices[2].y) / 2
                        text_infos.append((symbol.text, xcenter, ycenter))

    # Template matching: assign symbols to the template regions.
    result_dict = {}
    for obj in card_template.findall("./object"):
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = int(bndbox.find('xmin').text)
        ymin = int(bndbox.find('ymin').text)
        xmax = int(bndbox.find('xmax').text)
        ymax = int(bndbox.find('ymax').text)
        # NOTE(review): only the vertical bounds are rescaled; the horizontal
        # bounds are used as written in the template. Confirm whether x should
        # be scaled by the same ratio.
        ymin = int(ymin * ratio)
        ymax = int(ymax * ratio)

        result_dict[name] = ''.join(
            text
            for text, xcenter, ycenter in text_infos
            if xmin <= xcenter <= xmax and ymin <= ycenter <= ymax
        )

    return result_dict

# Run OCR on the sample image.
# NOTE(review): `gcp_read` is not imported or defined in this snippet;
# presumably it is the module holding the functions above — confirm and add
# the matching import.
response = gcp_read.read_ocr("sample.jpg")

# Load the template
card_template = gcp_read.card_template("card.xml")
# 640 is passed as the `height` argument of card_analytics.
result = gcp_read.card_analytics(response, card_template, 640)




