[Crawling] Crawling Twitter Feeds with Selenium (with Date Interval)

2021, Dec 27    


Crawling Code


from selenium import webdriver as wd
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
import re
import datetime as dt
import urllib.parse
import time


def extract_text(tweets):
    tagout = re.compile('<.*?>')                        # HTML tags
    unicodeout = re.compile(r'\\u[0-9a-fA-F]{4,5}')     # literal \uXXXX escape sequences
    tw = []
    for t in tweets:
        t = re.sub(tagout, "", str(t))
        t = re.sub(unicodeout, "", t)
        t = re.sub(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", t)         # keep only Hangul and spaces
        tw.append(t)
    return tw
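# quick sanity check of extract_text: only Hangul and spaces survive the three
# substitutions, e.g.
#   extract_text(["<div>코로나 때문에 우울한 day 😷</div>"])  ->  ['코로나 때문에 우울한  ']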



def get_freq_only(keyword, startdate, middate, enddate):      # count posts containing the keyword, without collecting their text
    total_freq = []
    keyword_parse = urllib.parse.quote_plus(keyword)
    
    while startdate != enddate:        
        url = "https://twitter.com/search?q=" + keyword_parse + "%20since%3A" + str(startdate) + "%20until%3A" + str(middate) + "&src=typed_query&f=top"
        driver.get(url)
        time.sleep(5)

        html = driver.page_source
        soup = BeautifulSoup(html,'html.parser')
        last_height = driver.execute_script("return document.body.scrollHeight")
        
        wordfreq = 0
        dailyfreq = {'Date' : startdate}
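        # NOTE: this auto-generated CSS class marks the tweet-text div and changes whenever
        # Twitter redeploys; update it when find_all() starts returning nothing. Tweets that
        # stay rendered across scroll snapshots may also be counted more than once below.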
        tweets = soup.find_all("div", {'class' : "css-901oao r-18jsvk2 r-37j5jr r-a023e6 r-16dba41 r-rjixqe r-bcqeeo r-bnwqim r-qvutc0"})
        wordfreq += len(tweets)
        
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            time.sleep(1.5)
            new_height = driver.execute_script("return document.body.scrollHeight")

            if new_height != last_height:
                html = driver.page_source
                soup = BeautifulSoup(html,'html.parser')
                tweets = soup.find_all("div", {'class' : "css-901oao r-18jsvk2 r-37j5jr r-a023e6 r-16dba41 r-rjixqe r-bcqeeo r-bnwqim r-qvutc0"})
                wordfreq += len(tweets)

            else:                                   # reached the bottom of this day's results
                dailyfreq['Frequency'] = wordfreq
                total_freq.append(dailyfreq)        # record the daily keyword frequency
                
                startdate = middate
                middate += dt.timedelta(days=1)
                break

            last_height = new_height
    
    return total_freq
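# each element of total_freq is a dict like {'Date': dt.date(2020, 1, 1), 'Frequency': 42}
# (values illustrative); pandas later turns each one into a row of the Excel file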



def search_twitter(keyword, startdate, middate, enddate):       
    tweets_bag = []
    keyword_parse = urllib.parse.quote_plus(keyword)

    while startdate != enddate:         # step through the range in 7-day intervals
        url = "https://twitter.com/search?q=" + keyword_parse + "%20since%3A" + str(startdate) + "%20until%3A" + str(middate) + "&src=typed_query&f=top"
        driver.get(url)
        time.sleep(5)
        
        # the Top tab exposes relatively few posts, so move to the Latest tab
        latest = driver.find_element(By.CSS_SELECTOR, "#react-root > div > div > div.css-1dbjc4n.r-18u37iz.r-13qz1uu.r-417010 > main > div > div > div > div > div > div.css-1dbjc4n.r-aqfbo4.r-14lw9ot.r-gtdqiz.r-1gn8etr.r-1g40b8q > div:nth-child(2) > nav > div > div.css-1dbjc4n.r-1adg3ll.r-16y2uox.r-1wbh5a2.r-1pi2tsx.r-1udh08x > div > div:nth-child(2) > a")
        latest.click()
        time.sleep(5)
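        # an alternative worth trying: building the search URL with f=live instead of
        # f=top should open the Latest tab directly and skip the brittle click above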

        html = driver.page_source
        soup = BeautifulSoup(html,'html.parser')
        last_height = driver.execute_script("return document.body.scrollHeight")
        
        tw = []
        weeklyfreq = 0
        tweets = soup.find_all("div", {'class' : "css-901oao r-18jsvk2 r-37j5jr r-a023e6 r-16dba41 r-rjixqe r-bcqeeo r-bnwqim r-qvutc0"})
        tw += extract_text(tweets)
        weeklyfreq += len(tweets)

        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            time.sleep(1.5)
            new_height = driver.execute_script("return document.body.scrollHeight")

            if new_height != last_height:
                html = driver.page_source
                soup = BeautifulSoup(html,'html.parser')
                
                tweets = soup.find_all("div", {'class' : "css-901oao r-18jsvk2 r-37j5jr r-a023e6 r-16dba41 r-rjixqe r-bcqeeo r-bnwqim r-qvutc0"})
                tw += extract_text(tweets)
                weeklyfreq += len(tweets)
            else:                                   # reached the bottom of this week's results
                tweets_bag.append([startdate, weeklyfreq, tw])
                startdate = middate
                middate += dt.timedelta(days=7)
                break

            last_height = new_height
    
    return tweets_bag
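# each row of tweets_bag looks like [dt.date(2020, 1, 1), 57, ['...', ...]] (values
# illustrative), matching the Date / Weekly Frequency / Tweets columns in createDF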



def createDF(total_freq, tweets_bag, keyword, tag):
    if total_freq:          # result of get_freq_only
        df1 = pd.DataFrame(total_freq)
        df1.to_excel("Total_Freq_" + keyword + "_" + str(tag) + ".xlsx")      # to_excel needs openpyxl installed

    if tweets_bag:          # result of search_twitter
        df2 = pd.DataFrame(tweets_bag, columns=['Date', 'Weekly Frequency', 'Tweets'])
        df2.to_excel("Tweets_" + keyword + "_" + str(tag) + ".xlsx")



if __name__ == "__main__":

    keywords = ['코로나, 감정', '코로나, 기분', '코로나, 일상']
    years = [2020, 2021]
    driver = wd.Chrome(service=Service("chromedriver.exe"))
    yearly_freq = []
    tweets_bag = []

    for keyword in keywords:
        for year in years:
            if year == 2020: limit = 13
            elif year == 2021: limit = 11
            for i in range(1, limit):      # months 1-12 for 2020, 1-10 for 2021
                startdate = dt.date(year=year,month=i,day=1)
                middate = dt.date(year=year,month=i,day=7)
                enddate = dt.date(year=year,month=i,day=28)
                tweets_bag += search_twitter(keyword, startdate, middate, enddate)

                if i == 6 or i == limit-1:      # dump results to Excel in half-year chunks
                    if i == 6: add = "_상반기"            # first half of the year
                    elif i == limit-1: add = "_하반기"    # second half of the year
                    tag = str(year) + "년" + add
                    createDF(yearly_freq, tweets_bag, keyword, tag)
                    tweets_bag = []

        # yearly_freq = get_freq_only(keyword, startdate, middate, enddate)


  • Twitter turned out to have far more ads than expected, which created a junk-data problem. Cleaning them out would have meant tedious manual work, so I just skipped it; it should be possible to identify the ad patterns and filter them with a regex (see the sketch below).
  • Searching by keyword and date offers feed options such as Top and Latest; the Top feed exposes too few posts, so I crawled the Latest feed.
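
A minimal sketch of that filtering idea, assuming promoted tweets can be recognized by a label such as "Promoted" (or "프로모션" in the Korean UI) somewhere in the tweet's raw markup; the exact marker would need to be checked against the live page:

import re

AD_MARKER = re.compile(r"Promoted|프로모션")    # hypothetical ad label; verify against the real DOM

def drop_ads(tweets):
    # keep only tweet elements whose raw HTML does not contain the ad label
    return [t for t in tweets if not AD_MARKER.search(str(t))]

# usage inside search_twitter, before extracting the text:
#   tweets = drop_ads(soup.find_all("div", {'class': ...}))
#   tw += extract_text(tweets)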