본문 바로가기

프로그래밍

셀레니움, Kobart-summarization을 이용한 나무위키 목차별 요약

import re

import torch
from transformers import PreTrainedTokenizerFast
from transformers import BartForConditionalGeneration

def model_load(model_name):
    """Load the KoBART summarization model together with its tokenizer.

    Args:
        model_name: Prefix placed in front of ``/kobart-summarization`` to
            form the identifier handed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Both objects are loaded from the same identifier, so build it once.
    repo_id = f"{model_name}/kobart-summarization"
    tokenizer = PreTrainedTokenizerFast.from_pretrained(repo_id)
    summarizer = BartForConditionalGeneration.from_pretrained(repo_id)
    return summarizer, tokenizer

# Numeric footnote markers such as "[1]" or "[23]".
_FOOTNOTE_RE = re.compile(r"\[\d+\]")

# Newlines become spaces; single quotes and stray square brackets are dropped.
_CLEANUP_TABLE = str.maketrans({"\n": " ", "'": None, "[": None, "]": None})


def preprocessing(text):
    """Clean scraped wiki text before it is passed to the summarizer.

    Numeric footnote markers (``[1]``, ``[23]``, ...) are removed entirely,
    newlines are turned into spaces, single quotes and any remaining square
    brackets are stripped, and the result is cut to at most 1024 characters.

    Args:
        text: Raw section text.

    Returns:
        The cleaned text, at most 1024 characters long.
    """
    # str.replace() matches literally, so a regex is needed here; otherwise
    # only the brackets would be stripped and the footnote digits would stay.
    text = _FOOTNOTE_RE.sub("", text)
    text = text.translate(_CLEANUP_TABLE)
    return text[:1024]

def summarizing(text, model, tokenizer):
    """Summarize ``text`` with the given model and tokenizer.

    Text shorter than 20 characters is returned unchanged without touching
    the model.

    Args:
        text: Text to summarize.
        model: Object providing ``generate``.
        tokenizer: Object providing ``encode``, ``decode``, ``bos_token_id``
            and ``eos_token_id``.

    Returns:
        The decoded summary, or ``text`` itself when it is too short.
    """
    if len(text) < 20:
        return text

    encoded = tokenizer.encode(text)
    # Wrap the encoded text in BOS/EOS ids and add a batch dimension of 1.
    batch = torch.tensor(
        [[tokenizer.bos_token_id, *encoded, tokenizer.eos_token_id]]
    )
    generated = model.generate(batch, num_beams=4, max_length=512, eos_token_id=1)
    return tokenizer.decode(generated.squeeze().tolist(), skip_special_tokens=True)

 

 

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from Model import model_load
from Model import preprocessing
from Model import summarizing
import sys


def chromeWebDriver():
    """Create a Chrome WebDriver instance.

    The driver executable path is obtained from ``ChromeDriverManager``.
    The ``detach`` experimental option is enabled and the ``enable-logging``
    switch is excluded.

    Returns:
        The ``webdriver.Chrome`` instance.
    """
    service = ChromeService(executable_path=ChromeDriverManager().install())

    opts = Options()
    experimental = {
        'detach': True,
        'excludeSwitches': ['enable-logging'],
    }
    for option_name, option_value in experimental.items():
        opts.add_experimental_option(option_name, option_value)

    return webdriver.Chrome(service=service, options=opts)

def document_not_found(driver):
    """Terminate the program when the loaded page is the "not found" page.

    The text of the first ``<p>`` element is compared with the message shown
    for a missing document. On a match a hint is printed, the browser session
    is shut down and the process exits via ``sys.exit()``. Otherwise the
    function returns ``None`` and leaves the driver untouched.

    Args:
        driver: WebDriver that has already loaded the document page.

    Raises:
        SystemExit: If the document does not exist.
    """
    first_paragraph = driver.find_element(By.TAG_NAME, 'p').text
    if first_paragraph != "해당 문서를 찾을 수 없습니다.":
        return

    print("실제 나무위키에 등록된 이름으로 검색해야 합니다.")
    # quit() ends the whole WebDriver session (browser and driver process);
    # close() would only close the current window.
    driver.quit()
    # The original had a second, unreachable sys.exit("종료") after this call.
    sys.exit()

def textRetrieval(name):
    """Fetch a namu.wiki document and pair its headings with section texts.

    Args:
        name: Document title, appended to ``https://namu.wiki/w/``.

    Returns:
        A tuple ``(index_list, text_list, index_text)``: the table-of-contents
        lines, the section texts, and a dict mapping each table-of-contents
        line to the section text at the same position.

    Raises:
        SystemExit: Propagated from ``document_not_found`` when the document
            does not exist.
        IndexError: If fewer section texts than table-of-contents lines were
            found on the page.
    """
    driver = chromeWebDriver()
    try:
        driver.get('https://namu.wiki/w/' + name)
        document_not_found(driver)
        driver.implicitly_wait(2)
        driver.maximize_window()
        # Table of contents: one heading per line.
        index_list = driver.find_element(By.CLASS_NAME, '_0yPpiNcJ').text.split('\n')
        # Section bodies.
        text_list = [
            element.text
            for element in driver.find_elements(By.CLASS_NAME, 'qp2Iq0Hi')
        ]
    finally:
        # Always release the browser, even when a lookup above raises.
        driver.quit()

    if len(text_list) < len(index_list):
        # Same exception type the positional lookup used to raise, but with
        # a message that explains what went wrong.
        raise IndexError(
            f"found {len(index_list)} table-of-contents entries but only "
            f"{len(text_list)} section texts"
        )

    # Headings and texts are matched by position; extra texts are ignored.
    index_text = dict(zip(index_list, text_list))

    return index_list, text_list, index_text


def run_separate(name, model, tokenizer):
    """Summarize a single section chosen interactively by the user.

    The table of contents is printed, the user enters a heading prefix such
    as ``1.`` or ``3.2.``, and the first heading starting with that prefix is
    summarized. If no heading matches, a message is printed and nothing is
    summarized.

    Args:
        name: Document title passed to ``textRetrieval``.
        model: Summarization model passed to ``summarizing``.
        tokenizer: Tokenizer passed to ``summarizing``.
    """
    index_list, _, index_text = textRetrieval(name)
    print(index_list)
    choose = input("요약을 원하는 인덱스 번호를 입력 >> EX) '1.' '3.2.' ")

    # First heading that starts with the entered prefix, if any.
    key = next((title for title in index_list if title.startswith(choose)), None)
    if key is None:
        # Previously this path crashed because `key` was never assigned.
        print(f"'{choose}'(으)로 시작하는 목차를 찾을 수 없습니다.")
        return
    print('선택한 목차 >> ', key)

    text = preprocessing(index_text[key])
    print(f"원본문서 >> \n{text}")
    # Summarize the selected section.
    summarized = summarizing(text, model, tokenizer)
    print(f"요약 결과 >> \n{summarized}")

def run_totally(name, model, tokenizer):
    """Print every heading of the document followed by its section summary.

    Args:
        name: Document title passed to ``textRetrieval``.
        model: Summarization model passed to ``summarizing``.
        tokenizer: Tokenizer passed to ``summarizing``.
    """
    index_list, text_list, _ = textRetrieval(name)
    for position, heading in enumerate(index_list):
        # The section text is looked up by the heading's position.
        cleaned = preprocessing(text_list[position])
        summary = summarizing(cleaned, model, tokenizer)
        print(heading, summary, sep='\n')


# Script entry point: load the model once, then summarize the chosen document.
if __name__ == '__main__':
    name = '지능'  # search term: title of the namu.wiki document to summarize
    model, tokenizer = model_load('gogamza')
    # Alternative mode: summarize a single section chosen interactively.
    #run_separate(name, model, tokenizer)
    run_totally(name,model,tokenizer)

사용법 : main.py 파일을 실행시킬 것.

실행문(`if __name__ == '__main__':` 블록)의 name 변수에 지정한 문자열(str)로 검색이 가능.

run_separate 함수는 요약하고자 하는 목차를 선택한 후 해당 목차만 요약

run_totally 함수는 목차별 요약 후 print

'프로그래밍' 카테고리의 다른 글

Bland's Rule(Simplex method)  (1) 2022.10.01