GCPのCloud Vision APIを利用してOCRを使い、画像の位置から文字列を取得

Cloud Vision APIを利用してOCRを使ってみました。日本語でもかなりの精度に驚いています。

ただし、文章とみなされた部分はひとまとまりの文字列として認識されますが、文字の間に空白が空いていたりすると、ひとつの文章として取得できないケースもあります。

そんな時には、各文字の位置情報をもとに、あらかじめ定義した領域に含まれる文字を集める処理を作ることで、ブロックごとにデータを取得できます。

コード

import io
from google.cloud import vision
import xml.etree.ElementTree as ET

def read_ocr(input_file):
    """Run Cloud Vision document text detection on an image file.

    Args:
        input_file: Path of the image file to send to the API.

    Returns:
        The response returned by
        ``ImageAnnotatorClient.document_text_detection``.
    """
    annotator = vision.ImageAnnotatorClient()

    # The API takes the raw image bytes, so read the file in binary mode.
    with io.open(input_file, 'rb') as fh:
        image_bytes = fh.read()

    return annotator.document_text_detection(
        image=vision.Image(content=image_bytes)
    )

def card_template(input_xml):
    """Parse a template XML file and return its root element.

    Args:
        input_xml: Path of the XML file (passed straight to ``ET.parse``).

    Returns:
        The root element of the parsed document.
    """
    return ET.parse(input_xml).getroot()

def _symbol_centers(response):
    """Return a ``(text, xcenter, ycenter)`` tuple for every OCR symbol."""
    centers = []
    for page in response.full_text_annotation.pages:
        for block in page.blocks:
            for paragraph in block.paragraphs:
                for word in paragraph.words:
                    for symbol in word.symbols:
                        vertices = symbol.bounding_box.vertices
                        # The center is the midpoint of vertices 0 and 2
                        # (presumably opposite corners of the box; confirm
                        # against the Vision API vertex ordering).
                        centers.append((
                            symbol.text,
                            (vertices[0].x + vertices[2].x) / 2,
                            (vertices[0].y + vertices[2].y) / 2,
                        ))
    return centers


def card_analytics(response, card_template, height):
    """Collect the OCR text that falls inside each template region.

    Every symbol in ``response.full_text_annotation`` is reduced to its
    center point. For each ``<object>`` in the template, the symbols whose
    center lies inside the object's ``<bndbox>`` (bounds inclusive) are
    concatenated in the order they appear in the response.

    Args:
        response: OCR response exposing
            ``full_text_annotation.pages[].blocks[].paragraphs[].words[].symbols[]``,
            where each symbol has ``text`` and ``bounding_box.vertices``.
        card_template: Root XML element containing ``size/height`` and any
            number of ``object`` elements with ``name`` and ``bndbox``
            (``xmin``, ``ymin``, ``xmax``, ``ymax``) children.
        height: Height of the OCR'd image. The template's ``ymin``/``ymax``
            are scaled by ``height / template height``; if either height is
            not positive, no scaling is applied.

    Returns:
        A dict mapping each object's name to the concatenated text found in
        its region (an empty string when nothing matches).

    Raises:
        AttributeError: If a required template element (``size``,
            ``height``, ``name``, ``bndbox`` or a coordinate) is missing.
        ValueError: If a height or coordinate is not an integer string.
    """
    size = card_template.find("./size")
    org_height = int(size.find('height').text)
    ratio = height / org_height if org_height > 0 and height > 0 else 1.0

    text_infos = _symbol_centers(response)

    result_dict = {}
    for obj in card_template.findall("./object"):
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = int(bndbox.find('xmin').text)
        ymin = int(bndbox.find('ymin').text)
        xmax = int(bndbox.find('xmax').text)
        ymax = int(bndbox.find('ymax').text)
        # NOTE(review): only the vertical bounds are rescaled; xmin/xmax are
        # used as written in the template. Confirm whether the horizontal
        # bounds should be scaled by the same ratio.
        ymin = int(ymin * ratio)
        ymax = int(ymax * ratio)

        result_dict[name] = "".join(
            text
            for text, xcenter, ycenter in text_infos
            if xmin <= xcenter <= xmax and ymin <= ycenter <= ymax
        )

    return result_dict

# Usage example. The functions defined above are assumed to be saved as a
# module named gcp_read (gcp_read.py); import it so the calls below resolve.
import gcp_read

# Run OCR on the sample image.
response = gcp_read.read_ocr("sample.jpg")

# Load the template
card_template = gcp_read.card_template("card.xml")
# 640 is passed as the image height used to scale the template's
# vertical bounds.
result = gcp_read.card_analytics(response, card_template, 640)

<!--
  Template describing the regions to extract from an OCR'd card image
  (presumably the "card.xml" loaded in the usage example; confirm).
  card_analytics reads size/height as the reference height for scaling and,
  for every <object>, collects the OCR symbols whose center lies inside its
  <bndbox>, keyed by the object's <name>. The other elements are not read
  by the code shown in this article.
-->
<annotation>
    <folder>Downloads</folder>
    <filename>card.jpg</filename>
    <path>/</path>
    <source>
        <database>Unknown</database>
    </source>
    <size>
        <width>1000</width>
        <height>640</height>
        <depth>3</depth>
    </size>
    <segmented>0</segmented>
    <object>
        <name>name</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>110</xmin>
            <ymin>35</ymin>
            <xmax>610</xmax>
            <ymax>95</ymax>
        </bndbox>
    </object>
    <object>
        <name>birthday</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>610</xmin>
            <ymin>35</ymin>
            <xmax>950</xmax>
            <ymax>95</ymax>
        </bndbox>
    </object>
</annotation>