
Instagram Crawling by shortcode

by mpv 2021. 1. 8.

With Instagram post images, the image link expires as time passes, so there were cases where the image could no longer be crawled.

Instagram-scraper (github.com/arc298/instagram-scraper) was the most stable way to crawl Instagram, with features such as saving the login session, crawling by location id, and crawling by place name. This post introduces how to crawl data by Instagram shortcode using that code.

First, using the code in github.com/arc298/instagram-scraper/blob/master/instagram_scraper/app.py, define a MyInstagramScraper class as shown below.

import argparse
import codecs
import configparser
import errno
import glob
from operator import itemgetter
import json
import logging.config
import hashlib
import os
import pickle
import re
import socket
import sys
import textwrap
import time
import xml.etree.ElementTree as ET
#import moviepy.editor as mpe

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse

import warnings
import threading
import concurrent.futures
import requests
import requests.packages.urllib3.util.connection as urllib3_connection
import tqdm



BASE_URL = 'https://www.instagram.com/'
LOGIN_URL = BASE_URL + 'accounts/login/ajax/'
LOGOUT_URL = BASE_URL + 'accounts/logout/'
CHROME_WIN_UA = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
USER_URL = BASE_URL + '{0}/?__a=1'
USER_INFO = 'https://i.instagram.com/api/v1/users/{0}/info/'

MAIN_STORIES_URL = BASE_URL + 'graphql/query/?query_hash=45246d3fe16ccc6577e0bd297a5db1ab&variables=%7B%22reel_ids%22%3A%5B%22{0}%22%5D%2C%22tag_names%22%3A%5B%5D%2C%22location_ids%22%3A%5B%5D%2C%22highlight_reel_ids%22%3A%5B%5D%2C%22precomposed_overlay%22%3Afalse%7D'
HIGHLIGHT_STORIES_USER_ID_URL = BASE_URL + 'graphql/query/?query_hash=c9100bf9110dd6361671f113dd02e7d6&variables=%7B%22user_id%22%3A%22{0}%22%2C%22include_chaining%22%3Afalse%2C%22include_reel%22%3Afalse%2C%22include_suggested_users%22%3Afalse%2C%22include_logged_out_extras%22%3Afalse%2C%22include_highlight_reels%22%3Atrue%2C%22include_related_profiles%22%3Afalse%7D'
HIGHLIGHT_STORIES_REEL_ID_URL = BASE_URL + 'graphql/query/?query_hash=45246d3fe16ccc6577e0bd297a5db1ab&variables=%7B%22reel_ids%22%3A%5B%5D%2C%22tag_names%22%3A%5B%5D%2C%22location_ids%22%3A%5B%5D%2C%22highlight_reel_ids%22%3A%5B%22{0}%22%5D%2C%22precomposed_overlay%22%3Afalse%7D'
STORIES_UA = 'Instagram 123.0.0.21.114 (iPhone; CPU iPhone OS 11_4 like Mac OS X; en_US; en-US; scale=2.00; 750x1334) AppleWebKit/605.1.15'

BROADCAST_URL = BASE_URL + 'api/v1/feed/user/{0}/story/'

TAGS_URL = BASE_URL + 'explore/tags/{0}/?__a=1'
LOCATIONS_URL = BASE_URL + 'explore/locations/{0}/?__a=1'
VIEW_MEDIA_URL = BASE_URL + 'p/{0}/?__a=1'
SEARCH_URL = BASE_URL + 'web/search/topsearch/?context=blended&query={0}'

QUERY_FOLLOWINGS = BASE_URL + 'graphql/query/?query_hash=c56ee0ae1f89cdbd1c89e2bc6b8f3d18&variables={0}'
QUERY_FOLLOWINGS_VARS = '{{"id":"{0}","first":50,"after":"{1}"}}'

QUERY_COMMENTS = BASE_URL + 'graphql/query/?query_hash=33ba35852cb50da46f5b5e889df7d159&variables={0}'
QUERY_COMMENTS_VARS = '{{"shortcode":"{0}","first":50,"after":"{1}"}}'

QUERY_HASHTAG = BASE_URL + 'graphql/query/?query_hash=ded47faa9a1aaded10161a2ff32abb6b&variables={0}'
QUERY_HASHTAG_VARS = '{{"tag_name":"{0}","first":50,"after":"{1}"}}'

QUERY_LOCATION = BASE_URL + 'graphql/query/?query_hash=ac38b90f0f3981c42092016a37c59bf7&variables={0}'
QUERY_LOCATION_VARS = '{{"id":"{0}","first":50,"after":"{1}"}}'

QUERY_MEDIA = BASE_URL + 'graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables={0}'
QUERY_MEDIA_VARS = '{{"id":"{0}","first":50,"after":"{1}"}}'

MAX_CONCURRENT_DOWNLOADS = 5
CONNECT_TIMEOUT = 90
MAX_RETRIES = 5
RETRY_DELAY = 5
MAX_RETRY_DELAY = 60

LATEST_STAMPS_USER_SECTION = 'users'


# safe_get() below expects this exception; in the full instagram-scraper package it
# is defined in a separate module, so a minimal definition is included here.
class PartialContentException(Exception):
    pass


class MyInstagramScraper(object):
    """InstagramScraper scrapes and downloads an instagram user's photos and videos"""
    
    def __init__(self, **kwargs):
        default_attr = dict(username='', usernames=[], filename=None,
                            login_user=None, login_pass=None,
                            followings_input=False, followings_output='profiles.txt',
                            destination='./', logger=None, retain_username=False, interactive=False,
                            quiet=False, maximum=0, media_metadata=False, profile_metadata=False, latest=False,
                            latest_stamps=False, cookiejar=None, filter_location=None, filter_locations=None,
                            media_types=['image', 'video', 'story-image', 'story-video', 'broadcast'],
                            tag=False, location=False, search_location=False, comments=False,
                            verbose=0, include_location=False, filter=None, proxies={}, no_check_certificate=False,
                            template='{urlname}', log_destination='')

        allowed_attr = list(default_attr.keys())
        default_attr.update(kwargs)

        for key in default_attr:
            if key in allowed_attr:
                self.__dict__[key] = default_attr.get(key)

        self.session = requests.Session()
        if self.no_check_certificate:
            self.session.verify = False

        try:
            if self.proxies and type(self.proxies) == str:
                self.session.proxies = json.loads(self.proxies)
        except ValueError:
            self.logger.error("Check is valid json type.")
            raise

        self.session.headers = {'user-agent': CHROME_WIN_UA}
        if self.cookiejar and os.path.exists(self.cookiejar):
            with open(self.cookiejar, 'rb') as f:
                self.session.cookies.update(pickle.load(f))
        self.session.cookies.set('ig_pr', '1')
        self.rhx_gis = ""

        self.cookies = None
        self.authenticated = False
        self.logged_in = False
        self.last_scraped_filemtime = 0
        self.initial_scraped_filemtime = 0
        if default_attr['filter']:
            self.filter = list(self.filter)
        self.quit = False

    def sleep(self, secs):
        min_delay = 1
        for _ in range(secs // min_delay):
            time.sleep(min_delay)
            if self.quit:
                return
        time.sleep(secs % min_delay)
        
        
    def safe_get(self, *args, **kwargs):
        # out of the box solution
        # session.mount('https://', HTTPAdapter(max_retries=...))
        # only covers failed DNS lookups, socket connections and connection timeouts
        # It doesn't work when the server terminates the connection while the response is downloaded
        retry = 0
        retry_delay = RETRY_DELAY
        while True:
            if self.quit:
                return
            try:
                response = self.session.get(timeout=CONNECT_TIMEOUT, cookies=self.cookies, *args, **kwargs)
                if response.status_code == 404:
                    return
                response.raise_for_status()
                content_length = response.headers.get('Content-Length')
                if content_length is not None and len(response.content) != int(content_length):
                    # if content_length is None we repeat anyway to get size and be confident
                    raise PartialContentException('Partial response')
                return response
            except KeyboardInterrupt:
                raise
            except (requests.exceptions.RequestException, PartialContentException) as e:
                if 'url' in kwargs:
                    url = kwargs['url']
                elif len(args) > 0:
                    url = args[0]
                if retry < MAX_RETRIES:
                    self.logger.warning('Retry after exception {0} on {1}'.format(repr(e), url))
                    self.sleep(retry_delay)
                    retry_delay = min(2 * retry_delay, MAX_RETRY_DELAY)
                    retry = retry + 1
                    continue
                else:
                    keep_trying = self._retry_prompt(url, repr(e))
                    if keep_trying == True:
                        retry = 0
                        continue
                    elif keep_trying == False:
                        return
                raise

    def _retry_prompt(self, url, exception_message):
        # Minimal stand-in for the interactive retry prompt in the original app.py.
        # Returning None makes safe_get() re-raise the exception once MAX_RETRIES is exceeded.
        return None
                    
    def get_json(self, *args, **kwargs):
        """Retrieve text from url. Return text as string or None if no data present """
        resp = self.safe_get(*args, **kwargs)

        if resp is not None:
            return resp.text
        
    def authenticate_as_guest(self):
        """Authenticate as a guest/non-signed in user"""
        self.session.headers.update({'Referer': BASE_URL, 'user-agent': STORIES_UA})
        req = self.session.get(BASE_URL)

        self.session.headers.update({'X-CSRFToken': req.cookies['csrftoken']})

        self.session.headers.update({'user-agent': CHROME_WIN_UA})
        self.rhx_gis = ""
        self.authenticated = True
        
    def authenticate_with_login(self):
        """Logs in to instagram."""
        self.session.headers.update({'Referer': BASE_URL, 'user-agent': STORIES_UA})
        req = self.session.get(BASE_URL)

        self.session.headers.update({'X-CSRFToken': req.cookies['csrftoken']})

        login_data = {'username': 'InstagramID@abc.com', 'password': 'InstagramPW'}
        login = self.session.post(LOGIN_URL, data=login_data, allow_redirects=True)
        self.session.headers.update({'X-CSRFToken': login.cookies['csrftoken']})
        self.cookies = login.cookies
        login_text = json.loads(login.text)

        # The original app.py continues with checkpoint / two-factor handling here;
        # for basic use it is enough to check the result and mark the session.
        if login_text.get('authenticated') and login.status_code == 200:
            self.authenticated = True
            self.logged_in = True
            self.session.headers.update({'user-agent': CHROME_WIN_UA})
        else:
            raise ValueError('Instagram login failed: ' + login.text)

Only the most essential parts were kept: the get_json and safe_get functions and the authentication code.

Afterwards, it is used like this:

mis = MyInstagramScraper()
mis.authenticate_with_login()

Logging in with an account this way (the login credentials are set in login_data inside the authenticate_with_login function) enables the crawling features.
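Since the constructor already restores cookies from a cookiejar file when one is given (see the pickle.load call in __init__), the login session can be saved once and reused on later runs instead of logging in every time. This is only a minimal sketch, assuming a hypothetical file name session.cookie and checking for Instagram's usual sessionid cookie:

import pickle

# 'session.cookie' is a hypothetical file name; the constructor above loads it
# with pickle when the file already exists (see the cookiejar handling in __init__).
COOKIEJAR = 'session.cookie'

mis = MyInstagramScraper(cookiejar=COOKIEJAR)
if not mis.session.cookies.get('sessionid'):
    # No saved Instagram session cookie was restored, so log in once...
    mis.authenticate_with_login()
    # ...and save the cookies so later runs can reuse the session.
    with open(COOKIEJAR, 'wb') as f:
        pickle.dump(mis.session.cookies, f)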

For crawling, assuming you already have an Instagram shortcode, it can be done as follows.

import os
import requests
import time
from PIL import Image
from io import BytesIO

def crawl_image(save_dir, post_info):
    """Save the first display_resources image of a post as <shortcode>.jpg in save_dir."""
    shortcode = post_info['shortcode']
    imgurl = post_info['display_resources'][0]['src']
    jpgfile = os.path.join(save_dir, shortcode + '.jpg')
    if os.path.isfile(jpgfile):
        print("skip", jpgfile, sep=' | ')
        return
    try:
        r = requests.get(imgurl)
        b = BytesIO(r.content)
        img = Image.open(b)
        if not os.path.isdir(save_dir):
            os.mkdir(save_dir)
        img.save(jpgfile)
        print('  ', jpgfile, sep=' | ')
        time.sleep(.5)
    except Exception:
        print('\timg load fail')
        
        
shortcode = 'POST_SHORTCODE'  # placeholder: the target post's shortcode (the part after /p/ in its URL)
curl = 'https://www.instagram.com/p/{}/?__a=1'.format(shortcode)
save_dir = "."  # directory where images are saved

json_str = mis.get_json(curl)
post_info = json.loads(json_str)['graphql']['shortcode_media']
crawl_image(save_dir, post_info)

This builds the URL, downloads the post's info JSON from it, and saves the first src in display_resources. (That image is smaller than the original; to get the original-size data, use

imgurl = post_info['display_url']

instead. Note, however, that these fields may change slightly as Instagram is updated.)
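In the usual GraphQL response layout, each display_resources entry also carries config_width and config_height, so instead of always taking the first (smallest) entry, the largest available one can be picked. A small sketch under that assumption, using a hypothetical helper largest_image_url:

def largest_image_url(post_info):
    """Return the widest display_resources src, or display_url if the list is missing."""
    resources = post_info.get('display_resources') or []
    if resources:
        best = max(resources, key=lambda r: r.get('config_width', 0))
        return best['src']
    return post_info['display_url']

# crawl_image above could then use this instead of display_resources[0]['src']:
# imgurl = largest_image_url(post_info)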
