福くんと鈴木一真さんの画像分類問題をCNNで解く（データセット作成編）

これまではMNISTやIris等の既成のデータセットをもとにして問題を解いてきたが、機械学習で一番苦労する部分は「データセットを作るところ」と噂でよく聞くので、今回はデータセットを自分で作って分類問題を解いてみようと思う。

題材は何にすべきか、結構迷ったが、鈴木福くんと鈴木一真さんの画像をCNNに分類させてみることにした。下記にもあるように、福くんと鈴木一真さんは容姿が似ていることで有名なので、今回はこのお二方のサンプルを集めて分類させるところまでいきたい。
(参考:俳優・鈴木一真(44)と鈴木福くん(9)がソックリ過ぎる)

2クラス分類なので課題としては簡単。
この二人に似ている人が他にいれば教えてください。

データセット作成

肝はこのフェーズ。
福くんと鈴木一真さんの画像をそれぞれ集める。

今回はGoogle Custom Search APIを使った。
無料使用の範囲では、一日100クエリまでしか検索できないという制約があるが、1クエリで最大10件の画像URLを取得でき、今回はそこまで量を集めるわけではない（1検索語あたり100枚＝10クエリ）ので問題なかった。

インポートするライブラリ

import urllib.request
from urllib.parse import quote
import httplib2
import json 
import os
import time
import copy

# Credentials for the Google Custom Search API request built in getImageUrl.
# The values below are placeholders and must be replaced with real ones.
API_KEY = "×××××××××××"  # sent as the `key` query parameter
CUSTOM_SEARCH_ENGINE = "○○○○○○○○○○○"  # sent as the `cx` query parameter

Google Custom Search APIで画像URLを取得

# Fetch image URLs
def getImageUrl(search_item, total_num):
    """Collect image URLs for a search term via the Google Custom Search API.

    The API is queried in pages of up to 10 results (``num``), advancing the
    1-based ``start`` index by 10 each time, until ``total_num`` results have
    been requested or the API returns a page without any items.

    Args:
        search_item: Search term; it is percent-encoded before being sent.
        total_num: Number of image URLs to request. Values <= 0 return an
            empty list without issuing any request.

    Returns:
        A list of the ``link`` values of the returned items, in API order.
        May be shorter than ``total_num`` when fewer results are available.

    Raises:
        urllib.error.URLError: If an HTTP request fails (includes HTTPError,
            e.g. when the API rejects the key or the quota is exhausted).
    """
    img_list = []
    i = 0

    while i < total_num:
        # Build the query; the last page may ask for fewer than 10 results.
        num = min(10, total_num - i)
        query_img = (
            "https://www.googleapis.com/customsearch/v1?key=" + API_KEY
            + "&cx=" + CUSTOM_SEARCH_ENGINE
            + "&num=" + str(num)
            + "&start=" + str(i + 1)
            + "&q=" + quote(search_item)
            + "&searchType=image"
        )
        # Close the HTTP response deterministically instead of leaking it.
        with urllib.request.urlopen(query_img) as res:
            data = json.loads(res.read().decode('utf-8'))

        # A response with no results has no "items" key; stop paging instead
        # of raising KeyError (and instead of spending more quota).
        items = data.get("items")
        if not items:
            break

        # Collect the URLs
        img_list.extend(item["link"] for item in items)
        i += 10
    return img_list

画像ダウンロード

# Download images from URLs
def getImage(search_item, img_list):
    """Download every URL in ``img_list`` into a directory named ``search_item``.

    Files are written as ``<search_item>/<search_item><index><ext>``, where
    ``index`` is the position of the URL in ``img_list`` and ``ext`` is the
    extension of the URL's path component. Downloading is best-effort: a URL
    that fails is reported on stdout and skipped, so indices may have gaps.

    Args:
        search_item: Name of the output directory and file-name prefix.
        img_list: Iterable of image URLs.

    Returns:
        None.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urlparse

    http = httplib2.Http(".cache")

    # Create the output directory once, up front, instead of re-checking
    # on every iteration.
    os.makedirs(search_item, exist_ok=True)

    for i, url in enumerate(img_list):
        try:
            # Take the extension from the URL *path* only, so a query string
            # such as "?w=300" does not end up in the file name.
            ext = os.path.splitext(urlparse(url).path)[1]
            response, content = http.request(url)

            # Do not save error pages (404 etc.) as if they were images.
            if response.status != 200:
                print("failed to download images.", url, response.status)
                continue

            # Save the image
            with open(search_item + "/" + search_item + str(i) + ext, 'wb') as f:
                f.write(content)
        except Exception as err:
            # Best-effort download: report which URL failed and why, then go
            # on. Unlike a bare ``except:``, Ctrl-C still interrupts the loop.
            print("failed to download images.", url, err)
            continue

if __name__ == "__main__":
    # Search term; getImage also uses it as the download directory name
    # and as the file-name prefix.
    search_word = "福くん"
    # Request 100 image URLs for the term, then download them.
    img_list = getImageUrl(search_word, 100)
    getImage(search_word, img_list)

顔部分のみ切り抜き

OpenCVのカスケード型分類器（Haar Cascade）を使って、画像から顔部分を検出して切り抜く。切り抜いた後、128×128ピクセルのサイズにリサイズしている（サイズは引数imsizeで指定する）。

import cv2
import time
import copy
from matplotlib import pyplot as plt

# Crop the face regions out of images
def cropFace(src_name, dst_name, imsize):
    """Detect faces in every image of a directory and save resized crops.

    Each image file in ``<cwd>/<src_name>`` is run through OpenCV's frontal
    face Haar cascade. Every detected face is cropped, resized to
    ``imsize`` x ``imsize`` and written as a TIFF file into
    ``<cwd>/Crop_<dst_name>``. Afterwards the image is shown with the
    detections drawn as rectangles, for visual inspection.

    Args:
        src_name: Directory (relative to the current working directory)
            holding the source images.
        dst_name: Suffix of the output directory ``Crop_<dst_name>`` and
            prefix of the output file names.
        imsize: Edge length in pixels of the square output images.

    Returns:
        None.

    Raises:
        IOError: If the Haar cascade file cannot be loaded.
    """
    # Load the Haar cascade classifier. ``expanduser`` is used instead of
    # ``os.environ['HOME']`` so an unset HOME does not raise KeyError.
    cascade_file = "haarcascade_frontalface_default.xml"
    path = os.path.expanduser("~") + "/anaconda3/share/OpenCV/haarcascades/" + cascade_file
    if not os.path.exists(path):
        # Fall back to the cascade directory exposed by the cv2 package,
        # when this build provides one.
        bundled = getattr(getattr(cv2, "data", None), "haarcascades", None)
        if bundled:
            path = os.path.join(bundled, cascade_file)
    face_cascade = cv2.CascadeClassifier(path)
    if face_cascade.empty():
        # Fail early with a clear message rather than inside detectMultiScale.
        raise IOError("failed to load Haar cascade: " + path)

    # Create the directory that receives the cropped images
    out_dir = os.getcwd() + '/Crop_' + dst_name
    os.makedirs(out_dir, exist_ok=True)

    # Sorted so that the per-image index used in file names is reproducible.
    img_paths = [
        src_name + "/" + name
        for name in sorted(os.listdir(os.getcwd() + "/" + src_name))
    ]

    # Compared against the lower-cased extension, so ".JPG" etc. also match
    # (the original list had 'JPG'/'JPEG' without the dot, which never matched).
    image_exts = {'.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp'}

    # Loop over the images
    for img_idx, img_path in enumerate(img_paths, start=1):
        ext = os.path.splitext(img_path)[1]
        if ext.lower() not in image_exts:
            continue

        img = cv2.imread(img_path)
        if img is None:
            # imread signals an unreadable/corrupt file by returning None.
            print("failed to read image: " + img_path)
            continue

        # imread yields BGR channel order, so convert with BGR2GRAY.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.3, 5)

        # Timestamp in the form "<Mon><day><Wkd>_<year>_<HHMMSS>".
        # split() without arguments also copes with ctime's double space
        # before single-digit days.
        weekday, month, day, clock, year = time.ctime().split()
        timestr = month + day + weekday + '_' + year + '_' + clock.replace(':', '')

        # Crop each detected face
        for num, (x, y, w, h) in enumerate(faces, start=1):
            # Resize the crop; cv2.resize returns a new array, so no
            # explicit copy of the slice is needed.
            resizeImg = cv2.resize(img[y:y + h, x:x + w], (imsize, imsize))

            # The image index keeps names unique even when several images
            # are processed within the same second.
            filename = (out_dir + "/" + dst_name + '_' + timestr + "_"
                        + str(img_idx) + "_" + str(num) + '.tif')
            cv2.imwrite(filename, resizeImg)

        # Visualise the detections for a manual check
        for (x, y, w, h) in faces:
            cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 2)
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.show()