[Crawling] 인스타그램(Instagram) 크롤링하기

[Crawling] 인스타그램(Instagram) 크롤링하기

2021, Dec 04    


Crawling Code


from selenium import webdriver as wd        # python interpreter 설정 주의
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
# from selenium.webdriver.support.ui import WebDriverWait as Wait
# from selenium.webdriver.support import expected_conditions as EC

import time

# Single shared Chrome session used by every function in this script.
# The string argument names the ChromeDriver executable (presumably resolved
# relative to the working directory -- confirm for your setup).
driver = wd.Chrome("chromedriver.exe")

def login(user_id, user_pw):
    """Log in to Instagram with the shared ``driver`` and click through two follow-up elements.

    Args:
        user_id: Account username typed into the first login field.
        user_pw: Account password typed into the second login field.

    The function relies on fixed sleeps and on Instagram's generated CSS
    class names, so it breaks whenever the page markup changes.
    """
    driver.get("https://www.instagram.com/accounts/login/")     # Instagram login URL
    time.sleep(4)

    # Both credential inputs carry the "_2hvTZ" class: index 0 receives the
    # username and index 1 the password.
    username_box = driver.find_elements_by_class_name("_2hvTZ")[0]
    username_box.send_keys(user_id)
    password_box = driver.find_elements_by_class_name("_2hvTZ")[1]
    password_box.send_keys(user_pw)
    password_box.send_keys(Keys.ENTER)
    time.sleep(5)

    # After submitting, two more elements are clicked in order, each followed
    # by a 4-second pause (presumably the post-login prompts -- confirm).
    for class_name, position in (("sqdOP", 0), ("aOOlW", 1)):
        driver.find_elements_by_class_name(class_name)[position].click()
        time.sleep(4)


def get_content():
    """Parse the post currently shown by ``driver`` and return its fields.

    Returns:
        list: ``[content, tags, like, date]`` where

        * ``content`` is the caption with every character other than Hangul
          and spaces removed (a single space when no caption element exists),
        * ``tags`` is a list of Hangul words taken from the hashtag matches,
        * ``like`` is the like-count link text with its first four characters
          and last character sliced off, or the integer ``0`` when the link
          is missing,
        * ``date`` is the first 10 characters of the ``datetime`` attribute
          of the post's ``time`` element.

    Raises:
        IndexError: If the page has no ``time._1o9PC.Nzb55`` element.
    """
    soup = BeautifulSoup(driver.page_source, 'lxml')

    # Only a missing element (empty selector result) is an expected failure
    # here; anything else, including Ctrl+C, must propagate.
    try:
        content = soup.select('div.C4VMK > span')[0].text
    except IndexError:
        content = " "

    try:
        # The slice drops a fixed prefix/suffix around the number
        # (presumably the label text around the count -- confirm).
        like = soup.select('a.zV_Nj')[0].text[4:-1]
    except IndexError:
        like = 0

    # NOTE: the character class contains a space, so one match runs through
    # the Hangul words that follow a hashtag until a non-Hangul character.
    # After joining and splitting on whitespace, each of those words becomes
    # its own entry in ``tags``.
    tags = re.findall(r'#[ㄱ-ㅎㅏ-ㅣ가-힣 ]+', content)
    tag = ''.join(tags).replace('#', ' ')
    tags = tag.split()

    # Keep only Hangul characters and spaces in the caption.
    content = re.sub(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", content)

    # Not guarded on purpose: a missing timestamp raises so the caller can
    # skip this post.
    date = soup.select('time._1o9PC.Nzb55')[0]['datetime'][:10]

    return [content, tags, like, date]


def go_next():
    """Click the element with class ``l8mY4`` and wait 10 seconds.

    The element is presumably the "next post" arrow of the open post view
    (confirm against the current page markup).
    """
    next_button = driver.find_element_by_class_name('l8mY4')
    next_button.click()
    time.sleep(10)


import pandas as pd
def createDF(result, idx):
    """Convert one batch of scraped rows to a DataFrame and save it as an Excel file.

    Args:
        result: List of ``[content, tags, like, date]`` rows.
        idx: Value appended to the file name to tell batches apart.

    The file is written to
    ``insta_crawling_<keyword>_<idx>.xlsx``; ``keyword`` is read from module
    scope, so it must be defined as a global before this function is called.
    """
    # Passing the column names to the constructor (rather than assigning them
    # afterwards) also works for an empty batch, which would otherwise raise
    # ValueError because an empty frame has zero columns.
    columns = ['content', 'tags', 'like', 'date']
    res_df = pd.DataFrame(result, columns=columns)
    res_df.to_excel("insta_crawling_" + str(keyword) + "_" + str(idx) + ".xlsx")


def searchBy(keyword, target=5001, chunk_size=200):
    """Crawl posts of a hashtag page one by one and save them in batches.

    Opens the explore page for ``keyword``, clicks the first thumbnail, then
    repeatedly scrapes the open post and moves to the next one.

    Args:
        keyword: Hashtag (without ``#``) appended to the explore URL.
        target: Exclusive upper bound of the post counter; ``target - 1``
            posts are attempted.
        chunk_size: Number of attempted posts after which the collected rows
            are written out through ``createDF``.

    Returns:
        list: Rows collected since the last batch was written.

    Raises:
        Exception: Whatever ``go_next`` raises when the retry in the error
            path fails as well; the crawl stops at that point.
    """
    url = "https://www.instagram.com/explore/tags/" + keyword
    driver.get(url)
    time.sleep(6)

    first = driver.find_elements_by_class_name("_9AhH0")[0]
    first.click()
    # Sets the driver's implicit wait for later element lookups; this call
    # itself does not pause.
    driver.implicitly_wait(5)

    result = []
    for i in range(1, target):
        try:
            result.append(get_content())
            go_next()
        except Exception:
            # Best effort: give the page a moment and try to move on once.
            # ``Exception`` (not a bare except) keeps Ctrl+C able to stop
            # the crawl.
            time.sleep(5)
            go_next()

        # Flush after post ``i`` has been handled so that it lands in batch
        # ``i``; skip the write when every post of the batch failed.
        if i % chunk_size == 0 and result:
            createDF(result, i)
            result = []

    return result


if __name__ == "__main__":
    # Placeholder credentials: replace with a real account before running.
    user_id, user_pw = "사용자id", "사용자pwd"

    # ``keyword`` has to live at module level because createDF reads it when
    # building the output file names.
    keyword = input("검색어를 입력하세요 : ")

    login(user_id, user_pw)
    searchBy(keyword)


한계


  • 피드를 5000~8000개 정도 크롤링하다 보면 인스타그램에서 자체적으로 block을 한다.
  • 이후로는 피드를 클릭해도 아무것도 뜨지 않기 때문에, 계속 새로운 계정을 생성하면서 진행했다.