공지사항 크롤링 및 데이터 저장(4)

Posted Aug 19, 2024

By minnnning

7 min read

공지사항을 크롤링하기 하기위해 셀레니움을 사용하고 목록을 추출한다 각 공지사항의 세부내용도 필요하기 때문에 다시 한번 해당 url에 접근해서 세부 내용을 가져와 db에 저장한다 각 학과별 공지사항을 크롤링한다 모든 페이지들이 형식이 비슷하다면 쉽겠지만 모두 다르기 때문에 다른 코드를 작성해야한다 그래도 코드 중복을 최대한 줄이기 위해서 하나의 클래스를 생성하고 오버라이딩해서 코드를 작성했다

1. 공지사항을 크롤링 하기위한 기본 클래스 생성

  
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import time,json

with open('config.json', 'r') as config_file:
    config = json.load(config_file)
    driver_path = config['driver_path']

class NoticeScraper:
    def __init__(self, url, site, category, notice_list_selector, notice_contents_selector):
        self.url = url
        self.site = site
        self.category = category
        self.notice_list_selector = notice_list_selector
        self.notice_contents_selector = notice_contents_selector
        self.driver = self._init_driver()

    def _init_driver(self):
        firefox_options = Options()
        firefox_options.add_argument("--headless")
        firefox_options.add_argument("--no-sandbox")
        firefox_options.add_argument("--disable-dev-shm-usage")
        # Geckodriver 서비스 설정
        service = Service(executable_path=driver_path)
        # Firefox 웹 드라이버 인스턴스 생성
        driver = webdriver.Firefox(service=service, options=firefox_options)
        return driver

    def get_notice_list(self):
        self.driver.get(self.url)
        time.sleep(2)  # 페이지 로딩 대기

        list_items = self.driver.find_elements(By.CSS_SELECTOR, self.notice_list_selector)
        notices = []
        for index, item in enumerate(list_items):
            if index < 3:
                continue

            row = item.find_elements(By.TAG_NAME, "td")
            if not row:
                continue

            notice = {
                "site": self.site,
                "category": self.category,
                "title": row[2].text.strip(),
                "url": row[2].find_element(By.TAG_NAME, "a").get_attribute("href").strip(),
                "date": row[6].text.strip()
            }
            notices.append(notice)
        
        return notices

    def get_contents_html(self, url):
        self.driver.get(url)
        time.sleep(3)  # 페이지 로딩 대기

        contents_element = self.driver.find_element(By.CSS_SELECTOR, self.notice_contents_selector)
        return contents_element.get_attribute('outerHTML')

    def get_contents_text(self, url):
        html_content = self.get_contents_html(url)
        soup = BeautifulSoup(html_content, 'html.parser')
        return soup.get_text(separator="\n").strip()

    def close(self):
        self.driver.quit()

이렇게 기본 클래스를 생성하고 상황에 따라서 오버라이딩 해서 사용

2. 각 공지사항 페이지 정보

학과별로 공지사항이 저장된 url과 오버라이딩을 했다면 해당 클래스를 작성 그리고 이외 학과별로 다른 정보를 저장한다

  
class 정보통신공학부:
    # 정보통신공학부 공지사항 설정
    url = "https://inform.chungbuk.ac.kr/bbs/bbs.php?db=notice"
    site = "정보통신공학부"
    category = "공지사항"
    notice_list_selector = "#content1 > div.section.clear > table:nth-child(6) > tbody > tr"
    notice_contents_selector = "#articles"

3. 크롤링을 실행하는 main.py 작성

  
# main.py
from notice_scraper import NoticeScraper
from 전기공학부 import 전기공학부
from 전자공학부 import 전자공학부, ElectronicEngineeringNoticeScraper
from 정보통신공학부 import 정보통신공학부
from 컴퓨터공학과 import 컴퓨터공학과
from 지능로봇공학과 import 지능로봇공학과, IntelligentRoboticsNoticeScraper
from 반도체공학부 import 반도체공학부
from 소프트웨어학과 import 소프트웨어학과, SoftwareDepartmentNoticeScraper

import pymysql, json
# DB 설정 데이터 가져오기
with open('config.json', 'r') as config_file:
    config = json.load(config_file)
    driver_path = config['driver_path']
    hosturl = config['host']
    username = config['username']
    userpassword = config['password']
    dbname = config['db']

table_N = 'notice_board'

# MariaDB 연결
db_connection = pymysql.connect(host=hosturl, user=username, password=userpassword, db=dbname, charset='utf8')
cursor = db_connection.cursor()

def clean_text(text):
    """텍스트에서 특수 문자 및 불필요한 공백을 제거합니다."""
    if text:
        return text.replace("\n", " ").replace("\r", " ").replace("'", "\\'")
    return text

def is_duplicate(url):
    """데이터베이스에 이미 존재하는 url인지 확인합니다."""
    sql = f"SELECT COUNT(*) FROM {table_N} WHERE url = %s"
    cursor.execute(sql, (url,))
    result = cursor.fetchone()
    return result[0] > 0

def get_scraper(department):
    if department == 전자공학부:
        return ElectronicEngineeringNoticeScraper(
            department.url,
            department.site,
            department.category,
            department.notice_list_selector,
            department.notice_contents_selector
        )
    
    elif department == 지능로봇공학과 or department == 반도체공학부:
        return IntelligentRoboticsNoticeScraper(
            department.url,
            department.site,
            department.category,
            department.notice_list_selector,
            department.notice_contents_selector
        )
    
    elif department == 소프트웨어학과:
        return SoftwareDepartmentNoticeScraper(
            department.url,
            department.site,
            department.category,
            department.notice_list_selector,
            department.notice_contents_selector
        )

    else:
        return NoticeScraper(
            department.url,
            department.site,
            department.category,
            department.notice_list_selector,
            department.notice_contents_selector
        )

if __name__ == "__main__":
    # 각 학과 설정들을 리스트에 담습니다.
    departments = [전기공학부, 전자공학부, 정보통신공학부, 컴퓨터공학과, 지능로봇공학과, 반도체공학부, 소프트웨어학과]

    for department in departments:
        print(f"스크래핑 시작: {department.site}")

        # 각 학과에 맞는 스크래퍼 인스턴스를 생성합니다.
        scraper = get_scraper(department)

        # notice_list를 가져와서 출력합니다.
        notice_list = scraper.get_notice_list()
        for notice in notice_list:
            if is_duplicate(notice['url']):
                print(f"중복된 데이터, 건너뜀: {notice['url']}")
                continue
            
            contents_text = clean_text(scraper.get_contents_text(notice['url'])) # 내용까지 스크래핑하는 코드 추가
            try:
                # 데이터베이스에 저장
                sql = f"INSERT INTO {table_N} (title, content ,date, url, site, category) VALUES (%s, %s, %s, %s, %s, %s)"
                values = (notice['title'], contents_text, notice['date'], notice['url'], notice['site'], department.category)
                cursor.execute(sql, values)
                db_connection.commit()
                print(f"Data inserted successfully: title={notice['title']}, site={notice['site']}")

            except pymysql.Error as e:
                print(f"Error {e.args[0]}, {e.args[1]}")
                db_connection.rollback()

        scraper.close()
        print(f"스크래핑 완료 및 브라우저 종료: {department.site}\n")

    # WebDriver 및 DB 연결 닫기
    db_connection.close()
    print("전정대학 작업 완료.")

이 main.py는 큰 단과대학 별로 코드를 작성했다(전체를 합치면 코드를 보고 유지보수하는데 힘들것 같고 각각 따로 실행한다면 중복된느 코드수가 너무 많아지기 때문에 적당히 단과대 별로 나눴다)

또한 매번 실행된느 코드 특성상 데이터를 중복해서 저장할 수 있기 때문에 url을 기준으로 이미 db에 해당 url이 있다면 저장하지 않도록 했다

검색엔진

This post is licensed under CC BY 4.0 by the author.

1. 공지사항을 크롤링 하기위한 기본 클래스 생성

2. 각 공지사항 페이지 정보

3. 크롤링을 실행하는 main.py 작성

Trending Tags