[Crawling] 인스타그램(Instagram) 크롤링하기
2021, Dec 04
Crawling Code
from selenium import webdriver as wd # python interpreter 설정 주의
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
# from selenium.webdriver.support.ui import WebDriverWait as Wait
# from selenium.webdriver.support import expected_conditions as EC
import time
# Module-level Chrome WebDriver used by the crawling functions in this file.
# It is created as soon as the module is executed or imported.
# NOTE(review): presumably expects chromedriver.exe to be resolvable from the
# current working directory -- confirm for your environment.
driver = wd.Chrome("chromedriver.exe")
def login(user_id, user_pw):
    """Log in to Instagram through the shared module-level ``driver``.

    Opens the login page, types the credentials into the two ``_2hvTZ``
    inputs (index 0 is the username field, index 1 the password field),
    submits with ENTER, then clicks through two follow-up elements.

    Args:
        user_id: Instagram username to type into the first input.
        user_pw: Password to type into the second input.

    Raises:
        IndexError: If an expected element is not present on the page.
    """

    def nth_by_class(class_name, position):
        # Look the element up freshly on every call, as the page may have
        # re-rendered between interactions.
        return driver.find_elements_by_class_name(class_name)[position]

    driver.get("https://www.instagram.com/accounts/login/")  # instagram url
    time.sleep(4)

    nth_by_class("_2hvTZ", 0).send_keys(user_id)  # id(username)

    password_box = nth_by_class("_2hvTZ", 1)  # pw
    password_box.send_keys(user_pw)
    password_box.send_keys(Keys.ENTER)
    time.sleep(5)

    # NOTE(review): presumably these dismiss the post-login prompts
    # (save login info / notifications) -- confirm against the live page.
    for class_name, position in (("sqdOP", 0), ("aOOlW", 1)):
        nth_by_class(class_name, position).click()
        time.sleep(4)
def get_content():
    """Scrape the post currently displayed by the shared ``driver``.

    Returns:
        list: ``[content, tags, like, date]`` where

        * ``content`` is the caption reduced to Hangul characters and
          spaces (``""``-like single space source when no caption is found),
        * ``tags`` is the list of Hangul hashtags found in the caption,
          without the leading ``#``,
        * ``like`` is the like-count text with its first four characters
          and last character removed, or the integer ``0`` when the like
          element is missing,
        * ``date`` is the first ten characters of the ``datetime``
          attribute of the post's ``time`` element.

    Raises:
        IndexError: If the ``time._1o9PC.Nzb55`` element is missing.
        KeyError: If that element has no ``datetime`` attribute.
    """
    soup = BeautifulSoup(driver.page_source, 'lxml')

    # An empty select() result is the only expected failure here, so catch
    # IndexError specifically rather than swallowing every exception.
    try:
        content = soup.select('div.C4VMK > span')[0].text
    except IndexError:
        content = " "

    try:
        # NOTE(review): the slice presumably strips a "좋아요 " prefix and a
        # trailing "개" from the like label -- confirm against the live page.
        like = soup.select('a.zV_Nj')[0].text[4:-1]
    except IndexError:
        like = 0

    # Capture only the Hangul run directly after each '#'. Allowing spaces
    # inside the match would pull the plain words that follow a hashtag
    # into the tag list.
    tags = re.findall(r'#([ㄱ-ㅎㅏ-ㅣ가-힣]+)', content)

    # Keep only Hangul characters and spaces in the caption text.
    content = re.sub(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", content)

    # Deliberately unguarded: a missing timestamp propagates to the caller.
    date = soup.select('time._1o9PC.Nzb55')[0]['datetime'][:10]
    return [content, tags, like, date]
def go_next():
    """Click the element with class ``l8mY4`` and then pause for 10 seconds.

    NOTE(review): presumably ``l8mY4`` is the "next post" arrow of the post
    viewer -- confirm against the live page.
    """
    next_button = driver.find_element_by_class_name('l8mY4')
    next_button.click()
    time.sleep(10)
import pandas as pd
def createDF(result, idx):
    """Write one batch of scraped rows to an Excel file.

    The output file is named ``insta_crawling_<keyword>_<idx>.xlsx``, where
    ``keyword`` is the module-level variable assigned in the ``__main__``
    block (it is not a parameter of this function).

    Args:
        result: List of ``[content, tags, like, date]`` rows. May be empty,
            in which case a file containing only the header row is written.
        idx: Value appended to the file name to tell batches apart.
    """
    # Passing the names to the constructor (instead of assigning
    # ``.columns`` afterwards) keeps an empty batch from raising a
    # length-mismatch ValueError. The first header is spelled 'content'.
    res_df = pd.DataFrame(result, columns=['content', 'tags', 'like', 'date'])
    res_df.to_excel("insta_crawling_" + str(keyword) + "_" + str(idx) + ".xlsx")
def searchBy(keyword, target=5001):
    """Crawl posts under a hashtag, saving rows in batches.

    Opens the tag's explore page, clicks the first ``_9AhH0`` element, then
    repeatedly scrapes the displayed post and advances to the next one.
    Whenever the loop counter is a multiple of 200 the rows collected so far
    are handed to ``createDF`` and the buffer is reset. That check runs
    before the current post is scraped, so the first file holds at most 199
    rows.

    Args:
        keyword: Hashtag (without ``#``) appended to the explore/tags URL.
        target: Exclusive upper bound of the loop counter, which starts at 1;
            the default of 5001 visits up to 5000 posts.

    Returns:
        list: Rows collected since the last batch was saved. These rows are
        not written to disk by this function.

    Raises:
        Exception: Whatever ``go_next`` raises when the retry inside the
            error handler also fails.
    """
    driver.get("https://www.instagram.com/explore/tags/" + keyword)
    time.sleep(6)

    driver.find_elements_by_class_name("_9AhH0")[0].click()
    driver.implicitly_wait(5)

    result = []
    for i in range(1, target):
        if i % 200 == 0:
            createDF(result, i)
            result = []
        try:
            result.append(get_content())
            go_next()
        except Exception:
            # Catch Exception rather than everything, so Ctrl-C
            # (KeyboardInterrupt) still stops a long-running crawl.
            # Give the page a moment, then try to move on once more.
            time.sleep(5)
            go_next()
    return result
if __name__ == "__main__":
    # ``keyword`` must stay a module-level name: createDF reads it to build
    # the output file name.
    keyword = input("검색어를 입력하세요 : ")
    # Placeholder credentials -- replace with a real account before running.
    user_id = "사용자id"
    user_pw = "사용자pwd"
    try:
        login(user_id, user_pw)
        searchBy(keyword)
    finally:
        # Always shut the browser and chromedriver down, even when the crawl
        # fails or is interrupted, so no stray processes are left behind.
        driver.quit()
한계
- 피드를 5000~8000개 정도 크롤링하다 보면 인스타그램에서 자체적으로 차단(block)을 한다.
- 이후로는 피드를 클릭해도 아무것도 뜨지 않기 때문에, 계속 새로운 계정을 생성하면서 진행했다.