main.py

import ast
import os
import re
import sys
from enum import IntEnum, unique
from queue import Queue
from threading import Thread

import requests
from bs4 import BeautifulSoup as bs

# Author link example:
# https://store.line.me/stickershop/author/97829/en
# Pack link example:
# https://store.line.me/stickershop/product/1000509/en
# Sticker link example:
# https://stickershop.line-scdn.net/stickershop/v1/sticker/67058943/iPhone/sticker@2x.png
# https://stickershop.line-scdn.net/stickershop/v1/sticker/73694/android/sticker.png
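

# Link categories recognised by determine_type().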
@unique
class Enums(IntEnum):
    AUTHOR = 1
    PACK = 2
    ERROR = 3
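

# Shared module-level state: Q1 queues individual sticker downloads, Q2 queues
# pack pages to crawl (Q3 is created but never used), and the counters track
# how many packs were crawled and how many files were saved.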
DIR_STICKERS: str = "stickers"
Q1: Queue = Queue(maxsize=0)
Q2: Queue = Queue(maxsize=0)
Q3: Queue = Queue(maxsize=0)
NUM_THREADS: int = 8
DOMAIN: str = "https://store.line.me"
sticker_count: int = 0
set_count: int = 0
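

# Entry point: given an author or pack URL, crawl it and download every sticker
# image it references into DIR_STICKERS/<pack name>/.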
def rip_line_stickers(argument: str):
    session = requests.Session()
    template_static = "https://stickershop.line-scdn.net/stickershop/v1/sticker/{}/{}/{}.{}"
    pattern_link_sticker = re.compile(
        r"https://stickershop.line-scdn.net/stickershop/v1/sticker/(?P<id>\d+)/(?P<platform>.+)/(?P<filename_with_ext>(?P<filename>.+)\.(?P<ext>.+))")
    pattern_a_next = re.compile(r"\?page=\d+")
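
    # Classify a store.line.me link as an author page or a single sticker pack.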
    def determine_type(link: str):
        if "/author/" in link:
            return Enums.AUTHOR
        elif "/product/" in link:
            return Enums.PACK
        else:
            return Enums.ERROR
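
    # Fetch a pack page and queue every sticker it contains on Q1.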
    def get_pack(link: str):
        result = session.get(link)
        result.raise_for_status()
        soup = bs(result.content, "lxml")
        tag_p = soup.find_all("p")
        name_pack = None
        for p in tag_p:
            if p.has_attr("data-test") and p["data-test"] == "sticker-name-title":
                name_pack = p.string
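
        # Each <li data-preview="..."> attribute holds a dict-like string with the
        # sticker type and URLs; static Android URLs are rewritten to the
        # higher-resolution iPhone "@2x" variant before queueing.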
        tag_li = soup.find_all("li")
        for li in tag_li:
            if li.has_attr("data-preview"):
                data_preview = ast.literal_eval(li["data-preview"])
                if data_preview["type"] == "static":
                    url = data_preview["staticUrl"]
                    if "/sticonshop/" not in url:
                        match = pattern_link_sticker.search(url)
                        try:
                            if match.group("platform") == "android":
                                url = template_static.format(match.group("id"), "iPhone",
                                                             match.group("filename") + "@2x",
                                                             match.group("ext"))
                        except Exception as e:
                            print("Exception while processing {}".format(name_pack))
                            print(e)
                            print(tag_li)
                elif data_preview["type"] == "animation":
                    url = data_preview["animationUrl"]
                    match = pattern_link_sticker.search(url)
                else:
                    print("Encountered unknown type: {}".format(data_preview["type"]))
                    exit(2)
                if "/sticonshop/" not in url:
                    Q1.put(
                        {
                            "name_pack": name_pack,
                            "filename": match.group("id"),
                            "ext": match.group("ext"),
                            "link": url
                        }
                    )
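
    # Fetch an author page and queue each listed pack's URL on Q2.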
    def get_author(link: str):
        result = session.get(link)
        result.raise_for_status()
        soup = bs(result.content, "lxml")
        tag_li = soup.find_all("li")
        for li in tag_li:
            if li.has_attr("data-test"):
                data_test = li["data-test"]
                if data_test == "author-item":
                    Q2.put(
                        {
                            "link": "{}{}".format(DOMAIN, li.a.get("href"))
                        }
                    )
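
        # Follow the "Next" pagination link, if present, and crawl that page too.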
        tag_a = soup.find_all("a")
        for a in tag_a:
            if a.has_attr("href") and a.text == "Next":
                if pattern_a_next.search(a.get("href")):
                    link = link.split("?")[0]
                    link = "{}{}".format(link, a.get("href"))
                    get_author(link)
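
    # Worker: take sticker jobs off Q1 and save each file under
    # DIR_STICKERS/<pack name>/<id>.<ext>.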
    def threaded_scrape(q: Queue):
        while True:
            things = q.get()
            name_pack = things["name_pack"]
            filename = things["filename"]
            ext = things["ext"]
            link = things["link"]
            path = "{}/{}/{}.{}".format(DIR_STICKERS, name_pack, filename, ext)
            if not os.path.exists("{}/{}".format(DIR_STICKERS, name_pack)):
                os.makedirs("{}/{}".format(DIR_STICKERS, name_pack), exist_ok=True)
            with open(path, "wb") as f:
                result = session.get(link)
                result.raise_for_status()
                f.write(result.content)
            global sticker_count
            sticker_count += 1
            q.task_done()
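
    # Worker: take pack links off Q2 and scrape each pack with get_pack().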
    def threaded_crawl(q: Queue):
        while True:
            things = q.get()
            link = things["link"]
            get_pack(link)
            global set_count
            set_count += 1
            q.task_done()
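
    # Seed the queues: an author link queues its packs on Q2, a pack link is
    # scraped directly.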
    link_type = determine_type(argument)
    if link_type == Enums.AUTHOR:
        get_author(argument)
    elif link_type == Enums.PACK:
        get_pack(argument)
    else:
        print("Could not determine link type!")
        exit(1)
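
    # Start daemon workers: one pool crawls pack pages (Q2), the other
    # downloads sticker files (Q1).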
    for i in range(NUM_THREADS):
        worker = Thread(target=threaded_crawl, args=(Q2,))
        worker.daemon = True
        worker.start()

    for i in range(NUM_THREADS):
        worker = Thread(target=threaded_scrape, args=(Q1,))
        worker.daemon = True
        worker.start()
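
    # Wait for all packs to be crawled, then for all downloads to finish.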
    Q2.join()
    print("Queued", set_count, "sets!")
    Q1.join()
    print("Downloaded", sticker_count, "files!")
if __name__ == '__main__':
    if len(sys.argv) <= 1:
        print("Pass a link as an argument.")
    else:
        rip_line_stickers(sys.argv[1])