In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
df = pd.read_csv("./data/sample_submission.csv")
In [4]:
df.head()
Out[4]:
| | id | target |
|---|---|---|
| 0 | 0000.jpg | human |
| 1 | 0001.jpg | human |
| 2 | 0002.jpg | human |
| 3 | 0003.jpg | human |
| 4 | 0004.jpg | human |
In [5]:
import os
import warnings
warnings.filterwarnings(action='ignore')
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # use GPU 0
In [6]:
from glob import glob
import PIL
from PIL import Image
In [7]:
# How to use glob.glob
f = open("./data/train/helloworld1.txt", 'w')
f.write("hello world1")
f.close()
f = open("./data/train/helloworld2.txt", 'w')
f.write("hello world2")
f.close()
# First, create two txt files and write some content into them.
In [8]:
path = "./data/train/"
output = glob("./data/train/*")
print(output)
print()
# With glob, '*' matches a string of any length, so every entry in the
# directory is returned.
# If the path contains txt files (we created two above), then
# output = glob("./data/train/*.txt")
# returns only the txt files in that path as a list.
output_txt = glob("./data/train/*.txt")
print(output_txt)
print()
# In glob, '?' matches exactly one character.
for filename in glob("./data/train/helloworld?.txt"):
    f = open(filename, 'r')
    print(f.readline())   # read the file contents
    f.close()
    os.remove(filename)   # delete the file
['./data/train\\airplane', './data/train\\automobile', './data/train\\bird', './data/train\\cat', './data/train\\deer', './data/train\\dog', './data/train\\frog', './data/train\\helloworld1.txt', './data/train\\helloworld2.txt', './data/train\\horse', './data/train\\ship', './data/train\\truck']

['./data/train\\helloworld1.txt', './data/train\\helloworld2.txt']

hello world1
hello world2
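Incidentally, glob also supports recursive matching; a minimal sketch (the pattern below is just for illustration) that collects every .jpg under all of the class folders in one call:

```python
from glob import glob

# '**' together with recursive=True descends into subdirectories, so this
# gathers every .jpg under ./data/train/ regardless of which class folder it is in.
all_jpgs = glob("./data/train/**/*.jpg", recursive=True)
print(len(all_jpgs), all_jpgs[:3])
```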
In [9]:
test = PIL.Image.open("./data/train/airplane/0000.jpg")
# read the image file with PIL and store it in test
from IPython.display import Image as ig
ig("./data/train/airplane/0000.jpg", width=200)  # this is what the image looks like
Out[9]:
In [10]:
test_list = np.array(test)  # converting the PIL image to a numpy array...
In [11]:
print(test_list)  # ...produces an array of RGB values like this
[[[200 202 197] [202 204 199] [203 205 200] ... [205 206 201] [202 203 198] [201 202 197]]
 [[210 212 207] [207 209 204] [208 210 205] ... [209 210 205] [207 208 203] [206 207 202]]
 [[212 214 211] [208 210 207] [212 214 211] ... [213 214 209] [211 212 207] [210 211 206]]
 ...
 [[219 219 221] [213 213 215] [193 193 195] ... [240 240 242] [238 238 240] [236 236 238]]
 [[212 212 214] [217 217 219] [217 217 219] ... [247 247 249] [244 244 246] [237 237 239]]
 [[221 221 223] [221 221 223] [220 220 222] ... [238 238 240] [241 241 243] [236 236 238]]]
In [12]:
test_list.shape  # the shape is (32, 32, 3):
# a 32 x 32 pixel image with 3 color channels (RGB)
Out[12]:
(32, 32, 3)
In [13]:
test_list_2 = [
[1,2,3,4],
[1,2,3,4]
]
test_list_2 = np.array(test_list_2)
test_list_2.shape
# 2 rows, 4 columns
Out[13]:
(2, 4)
In [14]:
test_list_3 = [[
[1,2,3,4],
[1,2,3,4]
]]
test_list_3 = np.array(test_list_3)
test_list_3.shape
# one plane of 2 rows x 4 columns
Out[14]:
(1, 2, 4)
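The same leading axis can be added without re-nesting the list; a small sketch using numpy's expand_dims (variable names are just for illustration):

```python
import numpy as np

arr = np.array([[1, 2, 3, 4],
                [1, 2, 3, 4]])        # shape (2, 4)
arr_3d = np.expand_dims(arr, axis=0)  # add a leading axis -> shape (1, 2, 4)
print(arr.shape, arr_3d.shape)
```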
In [23]:
# Build the training dataset
path = "./data/train/"
training_images = []
training_labels = []
for filename in glob(path + "*"):  # use glob to list every class directory under ./data/train/
    for img in glob(filename + "/*.jpg"):  # every image inside that directory
        an_img = PIL.Image.open(img)       # read image
        img_array = np.array(an_img)       # image to array
        training_images.append(img_array)  # append array to training_images
        label = filename.split('\\')[1]    # the directory name is the label
        training_labels.append(label)      # append label
training_images = np.array(training_images)  # convert to numpy arrays
training_labels = np.array(training_labels)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
training_labels = le.fit_transform(training_labels)
training_labels = training_labels.reshape(-1, 1)
# LabelEncoder converts the original string labels ['airplane', 'automobile', ...]
# into numeric codes.
print(training_images.shape)
print(training_labels.shape)
(50000, 32, 32, 3)
(50000, 1)
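Before training it can be worth checking that the classes are balanced; a minimal sketch using np.unique (the printed counts are whatever the dataset actually contains, not values from the original run):

```python
# Count how many training examples each encoded label has.
codes, counts = np.unique(training_labels, return_counts=True)
print(dict(zip(codes.tolist(), counts.tolist())))
```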
In [35]:
training_labels[40999]
Out[35]:
array([8], dtype=int64)
In [19]:
path = './data/test/'
a = glob(path + '*.jpg')[0]
a.split('\\')[1]
Out[19]:
'0000.jpg'
In [21]:
# Build the test dataset
path = './data/test/'
test_images = []
test_idx = []
flist = sorted(glob(path + '*.jpg'))
# flist holds the sorted list of './data/test/*.jpg' file paths.
for filename in flist:
    an_img = PIL.Image.open(filename)  # read image
    img_array = np.array(an_img)       # image to array
    test_images.append(img_array)      # append array to test_images
    label = filename.split('\\')[1]    # get the id (file name)
    test_idx.append(label)
test_images = np.array(test_images)
print(test_images.shape)
print(test_idx[0:5])
(10000, 32, 32, 3)
['0000.jpg', '0001.jpg', '0002.jpg', '0003.jpg', '0004.jpg']
In [28]:
%matplotlib inline
# show plots directly in the notebook
for i in range(10):
    plt.subplot(2, 5, i + 1)
    # draw several images in a 2 x 5 grid; the index advances the position one by one
    plt.imshow(training_images[i])
    # plt's image display function
    print(training_labels[i], end=",")
plt.show()
[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],
In [44]:
labels = glob('./data/train/*')
for label in labels:
    print(label.split("\\")[1])
# label order: an encoded label of [0] corresponds to airplane.
airplane
automobile
bird
cat
deer
dog
frog
horse
ship
truck
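The fitted LabelEncoder stores this same order, so the mapping can be read off the `le` object directly; a minimal sketch:

```python
# le.classes_ lists the original labels in the order of their numeric codes,
# so code 0 is 'airplane', code 8 is 'ship', and so on.
for code, name in enumerate(le.classes_):
    print(code, name)

# Going back from a numeric code to the label:
print(le.inverse_transform([8]))  # ['ship'], given the order printed above
```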
In [45]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(training_images, training_labels, test_size = 0.2, random_state=42)
X_test = test_images
In [46]:
print('X_train shape:', X_train.shape)
print('X_valid shape:', X_valid.shape)
print('X_test shape:', X_test.shape)
X_train shape: (40000, 32, 32, 3)
X_valid shape: (10000, 32, 32, 3)
X_test shape: (10000, 32, 32, 3)
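As an optional tweak (not part of the original notebook), train_test_split also accepts a stratify argument, which keeps the class proportions identical in the train and validation splits:

```python
X_train, X_valid, y_train, y_valid = train_test_split(
    training_images, training_labels,
    test_size=0.2, random_state=42,
    stratify=training_labels)  # keep per-class proportions equal in both splits
```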
In [50]:
print(X_train[0][0])
# RGB channels hold values from 0 to 255, so normalize them to [0, 1]
X_train = X_train / 255.0
X_valid = X_valid / 255.0
X_test = X_test / 255.0
[[193 220 241] [199 226 247] [202 229 250] [201 228 249] [201 228 249] [204 231 252]
 [204 231 252] [203 228 250] [203 228 250] [203 228 250] [203 228 250] [203 228 250]
 [203 228 250] [203 228 250] [203 228 250] [205 227 250] [198 229 250] [201 229 251]
 [202 228 251] [179 199 224] [131 147 173] [104 116 142] [107 114 142] [111 116 145]
 [103 109 135] [130 137 163] [171 183 207] [202 220 242] [209 232 250] [201 228 245]
 [197 229 244] [201 234 249]]
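A quick sanity check, as a minimal sketch, that the scaling landed in the expected range:

```python
# After dividing by 255.0 every pixel value should lie in [0, 1].
print(X_train.min(), X_train.max())  # expected: 0.0 and 1.0
print(X_train.dtype)                 # the division promotes uint8 to float64
```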
In [52]:
%pip install tensorflow
import tensorflow as tf
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(16, (3, 3), activation='relu', padding='SAME', input_shape=(32, 32, 3)),  # CNN layer
    tf.keras.layers.MaxPooling2D(2, 2, padding='SAME'),  # pooling layer
    tf.keras.layers.Dropout(0.23),  # randomly deactivate 23% of the units
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='SAME'),
    tf.keras.layers.MaxPooling2D(2, 2, padding='SAME'),
    tf.keras.layers.Dropout(0.23),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='SAME'),
    tf.keras.layers.MaxPooling2D(2, 2, padding='SAME'),
    tf.keras.layers.Dropout(0.23),
    tf.keras.layers.Flatten(),  # flatten the N-D feature map into a 1-D vector
    tf.keras.layers.Dense(1024, activation='relu'),  # fully connected layer
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')  # output layer
])
Collecting tensorflow
  Downloading tensorflow-2.8.0-cp39-cp39-win_amd64.whl (438.0 MB)
...
Successfully installed absl-py-1.0.0 astunparse-1.6.3 cachetools-5.0.0 flatbuffers-2.0 gast-0.5.3 google-auth-2.6.0 google-auth-oauthlib-0.4.6 google-pasta-0.2.0 grpcio-1.44.0 keras-2.8.0 keras-preprocessing-1.1.2 libclang-13.0.0 markdown-3.3.6 oauthlib-3.2.0 opt-einsum-3.3.0 protobuf-3.19.4 pyasn1-0.4.8 pyasn1-modules-0.2.8 requests-oauthlib-1.3.1 rsa-4.8 tensorboard-2.8.0 tensorboard-data-server-0.6.1 tensorboard-plugin-wit-1.8.1 tensorflow-2.8.0 tensorflow-io-gcs-filesystem-0.24.0 termcolor-1.1.0 tf-estimator-nightly-2.8.0.dev2021122109
Note: you may need to restart the kernel to use updated packages.
In [53]:
model.compile(optimizer='adam', loss = 'sparse_categorical_crossentropy', metrics=['accuracy'])
In [54]:
model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                    Output Shape              Param #
=================================================================
 conv2d (Conv2D)                 (None, 32, 32, 16)        448
 max_pooling2d (MaxPooling2D)    (None, 16, 16, 16)        0
 dropout (Dropout)               (None, 16, 16, 16)        0
 conv2d_1 (Conv2D)               (None, 16, 16, 32)        4640
 max_pooling2d_1 (MaxPooling2D)  (None, 8, 8, 32)          0
 dropout_1 (Dropout)             (None, 8, 8, 32)          0
 conv2d_2 (Conv2D)               (None, 8, 8, 32)          9248
 max_pooling2d_2 (MaxPooling2D)  (None, 4, 4, 32)          0
 dropout_2 (Dropout)             (None, 4, 4, 32)          0
 flatten (Flatten)               (None, 512)               0
 dense (Dense)                   (None, 1024)              525312
 dropout_3 (Dropout)             (None, 1024)              0
 dense_1 (Dense)                 (None, 10)                10250
=================================================================
Total params: 549,898
Trainable params: 549,898
Non-trainable params: 0
_________________________________________________________________
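The parameter counts in the summary follow directly from the layer shapes: a Conv2D layer has (kernel_h x kernel_w x in_channels + 1 bias) x filters weights, and a Dense layer has (in_features + 1 bias) x units. A small sketch reproducing the numbers above:

```python
# Conv2D: (kernel_h * kernel_w * in_channels + 1 bias) * filters
print((3 * 3 * 3  + 1) * 16)   # conv2d   -> 448
print((3 * 3 * 16 + 1) * 32)   # conv2d_1 -> 4640
print((3 * 3 * 32 + 1) * 32)   # conv2d_2 -> 9248

# Dense: (in_features + 1 bias) * units; Flatten yields 4 * 4 * 32 = 512 features
print((512 + 1) * 1024)        # dense    -> 525312
print((1024 + 1) * 10)         # dense_1  -> 10250
```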
In [55]:
model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=10, batch_size=128)
Epoch 1/10
313/313 [==============================] - 53s 163ms/step - loss: 1.8203 - accuracy: 0.3315 - val_loss: 1.5690 - val_accuracy: 0.4469
Epoch 2/10
313/313 [==============================] - 37s 118ms/step - loss: 1.5089 - accuracy: 0.4493 - val_loss: 1.3913 - val_accuracy: 0.5027
Epoch 3/10
313/313 [==============================] - 37s 117ms/step - loss: 1.3939 - accuracy: 0.4949 - val_loss: 1.2903 - val_accuracy: 0.5370
Epoch 4/10
313/313 [==============================] - 36s 116ms/step - loss: 1.3065 - accuracy: 0.5311 - val_loss: 1.2649 - val_accuracy: 0.5467
Epoch 5/10
313/313 [==============================] - 38s 123ms/step - loss: 1.2447 - accuracy: 0.5550 - val_loss: 1.1363 - val_accuracy: 0.6012
Epoch 6/10
313/313 [==============================] - 36s 116ms/step - loss: 1.2028 - accuracy: 0.5674 - val_loss: 1.1101 - val_accuracy: 0.6139
Epoch 7/10
313/313 [==============================] - 46s 148ms/step - loss: 1.1555 - accuracy: 0.5876 - val_loss: 1.0877 - val_accuracy: 0.6242
Epoch 8/10
313/313 [==============================] - 44s 139ms/step - loss: 1.1178 - accuracy: 0.6009 - val_loss: 1.0957 - val_accuracy: 0.6059
Epoch 9/10
313/313 [==============================] - 50s 159ms/step - loss: 1.0842 - accuracy: 0.6112 - val_loss: 0.9869 - val_accuracy: 0.6575
Epoch 10/10
313/313 [==============================] - 44s 140ms/step - loss: 1.0648 - accuracy: 0.6214 - val_loss: 0.9949 - val_accuracy: 0.6558
Out[55]:
<keras.callbacks.History at 0x184d0946f40>
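model.fit returns the History object shown above; assuming the call had been captured in a variable (the name `history` below is just for illustration), the curves from the training log can be plotted like this:

```python
# Assuming the training call had been captured, e.g.
#   history = model.fit(X_train, y_train,
#                       validation_data=(X_valid, y_valid),
#                       epochs=10, batch_size=128)
# history.history is a dict keyed by the metric names shown in the log above.
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='valid loss')
plt.plot(history.history['accuracy'], label='train accuracy')
plt.plot(history.history['val_accuracy'], label='valid accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()
```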
In [56]:
pred_proba = model.predict(X_test)
pred_class = []
for i in pred_proba:
    pred = np.argmax(i)
    pred_class.append(pred)
pred_class = le.inverse_transform(pred_class)
pred_class[0:5]
Out[56]:
array(['horse', 'bird', 'truck', 'dog', 'bird'], dtype='<U10')
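To turn these predictions into a submission file matching the sample_submission.csv format from the start of the post (the output path below is just an assumption):

```python
# test_idx holds the file names ('0000.jpg', ...) and pred_class the decoded
# labels, matching the id / target columns of sample_submission.csv.
submission = pd.DataFrame({'id': test_idx, 'target': pred_class})
submission.to_csv('./data/submission.csv', index=False)  # path is an assumption
submission.head()
```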
In [66]:
for i in range(5):
    plt.subplot(1, 5, i + 1)
    plt.imshow(X_train[i])
plt.show()
Source: https://dacon.io/competitions/official/235874/codeshare/4548?page=1&dtype=recent

This was for Dacon's object image classification competition. I wasn't yet comfortable with the concepts behind CNNs or TensorFlow, so my understanding of the details was limited, but simply following along was a good way to see how a CNN model flows from data to prediction and how it is actually used. Next I plan to study the underlying concepts properly.
In [ ]: