Python Instagram 해시태그 크롤링

2018. 12. 22. 13:00
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import time
import sys
from collections.abc import MutableMapping as MappingMixin  # noqa 
from collections.abc import Sequence  # noqa 
import pymssql
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
 
# Hashtag whose explore page is crawled. The same value is written to the
# BaseTag column by the insert statement in the crawl loop below.
tagName = "태그이름"
 
# Instagram "explore" listing page for that hashtag.
url = "https://www.instagram.com/explore/tags/" + tagName + "/"
 
# Location of the Chrome driver executable.
DRIVER_DIR = 'C:/Users/User/AppData/Local/Programs/Python/Python37-32/Lib/site-packages/chromedriver.exe'
# Chrome options shared by both browser instances created below.
cOptions = webdriver.ChromeOptions()
cOptions.add_argument('headless')
cOptions.add_argument('window-size=1920x1080')
cOptions.add_argument("disable-images")
# First browser: loads the tag listing page; the crawl loop scrolls it and
# reads the post links from it.
driver = webdriver.Chrome(DRIVER_DIR,options=cOptions)
driver.implicitly_wait(5) 
driver.get(url)
 
# Second browser: used by the crawl loop to open each individual post URL,
# so the listing page in `driver` is never navigated away from.
driver2 = webdriver.Chrome(DRIVER_DIR,options=cOptions)
 
# <body> element of the listing page; the crawl loop sends PAGE_DOWN to it.
elem = driver.find_element_by_tag_name("body") 
# Table for str.translate(): maps every code point above U+FFFF (outside the
# Basic Multilingual Plane, e.g. most emoji) to U+FFFD, the replacement
# character. NOTE(review): presumably needed because the console or database
# column cannot handle those characters - confirm.
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
 
# Working lists used by the crawl loop: post URLs found on the listing page
# and the hashtag strings read from the post currently being processed.
url_list = []
tag_list = []
pagedowns = 1  # not referenced anywhere else in the visible code
 
# Counter incremented by the crawl loop after each database write.
total = 0
# NOTE(review): the connection arguments look like placeholders that must be
# replaced with real server / credential values before running.
conn = pymssql.connect(server='ip address', user='db id', password='db password', database='db name')
try:
    # Crawl loop: scroll the tag listing page, collect the post links that are
    # currently in the DOM, open every new post in the second browser, read
    # its hashtags and store them in TagData.
    #
    # The loop has no exit condition of its own; it runs until an error occurs
    # or the process is interrupted (Ctrl-C). To cap the run, replace
    # `while True` with e.g. `while total < 100` (total counts stored posts:
    # 1 per photo, so 30 photos -> 30).
    # Debugging tip: remove the 'headless' option and check that the listing
    # really advances to the next batch of posts after each page-down.

    # URLs already written during this run. The MERGE below only inserts when
    # the URL is not in the table yet, so re-opening a stored post would be a
    # slow no-op; skip it instead.
    stored_urls = set()

    while True:
        elem.send_keys(Keys.PAGE_DOWN)
        time.sleep(1)  # give the page time to load the next batch of posts

        anchors = driver.find_elements_by_css_selector('div._bz0w > a')
        # De-duplicate the links currently present on the listing page.
        url_list = list({anchor.get_attribute('href') for anchor in anchors})

        for post_url in url_list:
            # Skip anchors without an href and posts stored earlier in the run.
            if not post_url or post_url in stored_urls:
                continue

            driver2.get(post_url)
            metas = driver2.find_elements_by_xpath("//meta[@property='instapp:hashtags']")
            # Build the tag list fresh for every post so that tags from a
            # skipped post can never leak into the next row. Characters
            # outside the BMP are replaced via non_bmp_map.
            tag_list = [
                (meta.get_attribute("content") or "").translate(non_bmp_map)
                for meta in metas
            ]
            # No hashtag meta elements, or only empty ones: nothing to store.
            if not ''.join(tag_list):
                time.sleep(0.2)
                continue

            # Report and store the URL that was actually visited. The previous
            # code indexed url_list with a separate 1-based counter, which
            # paired each post's tags with the *next* URL and eventually
            # raised IndexError.
            print(post_url, len(tag_list), sep=' ------- ')
            cur = conn.cursor()
            cur.execute(
                " MERGE TagData AS A"
                " USING  (SELECT %s AS URL) AS B"
                " ON A.URL = B.URL"
                " WHEN NOT MATCHED THEN"
                " INSERT (URL,Tags,BaseTag) VALUES (%s,%s,%s);",
                (post_url, post_url, ' | '.join(tag_list), tagName),
            )
            conn.commit()
            stored_urls.add(post_url)
            total = total + 1
            time.sleep(0.3)  # small pause between posts
except Exception as e:
    # Top-level boundary of the script: report the error and fall through to
    # the cleanup below.
    print(e)
finally:
    # Always release the database connection and both browsers, including on
    # Ctrl-C. Each step is attempted independently so that one failing close
    # (e.g. a browser that already crashed) does not leave the others open.
    # quit() closes every window and ends the driver session, so a separate
    # close() call is not needed.
    for release in (conn.close, driver2.quit, driver.quit):
        try:
            release()
        except Exception as cleanup_error:
            print(cleanup_error)
    
 
 
Colored by Color Scripter
cs
저작자표시 비영리 동일조건
'Python' 카테고리의 다른 글

Python 특정 사이트에서 이미지 다운로드 받기 (0)	2021.02.23
jh hyun

Python Instagram 해시태그 크롤링

'Python' 카테고리의 다른 글

티스토리툴바