from lxml.html import fromstring
import httpx
import os, asyncio, time, datetime, pytz, sys
import pathlib
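
# Scrape an archived 2chan (futaba) thread from tsumanne.net: collect the opening
# image, reply images, and uploaded files, download them, and write a local index.html.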

ROOTSRC = "https://tsumanne.net"


def get_src(index=0):
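    """Search tsumanne.net's archive for the hard-coded keyword and return the absolute
    URL of the log-table entry at `index` (the caller treats index 0 as the newest thread)."""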
    search_site = "https://tsumanne.net/si/%E3%82%B7%E3%83%A7%E3%83%B3%E3%83%9C%E3%83%AA%E3%83%AB%E3%83%89%E3%83%AB%E3%83%95"
    search_resp = httpx.get(search_site)
    assert search_resp.status_code == 200
    search_text = fromstring(search_resp.text)
    
    src_list = search_text.xpath("//table[@id='logs']//td[@class='url']/a[not(img)]/@href")

    src = src_list[index]
    return ROOTSRC + src


def add_ref(src,xpath):
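    """Join the text nodes matched by `xpath` under `src`, adding a trailing newline when non-empty."""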
    res = "\n".join(src.xpath(xpath))
    if res:
        return res + "\n"
    return res


def convert_url(texts):
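    """Turn bare upload filenames ("fu..." / "f...") found in reply text into full
    2chan upload URLs, skipping video files."""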
    urls = []
    for text in texts:
        # Skip video files; only other uploads are rehosted on the up/up2 servers.
        if text.endswith(".mp4") or text.endswith(".webm"):
            continue
        if text.startswith("fu"):
            urls.append("http://dec.2chan.net/up2/src/" + text)
        elif text.startswith("f"):
            urls.append("http://dec.2chan.net/up/src/" + text)
    return urls

def get_urls_data(src):
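    """Parse the archived thread page and return the opening image ("header"), replies
    whose anchors link uploaded files ("links"), and replies that only mention upload
    filenames in plain text ("zips")."""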
    resp = httpx.get(src)
    assert resp.status_code == 200
    resp.encoding = "shift_jis"

    html = fromstring(resp.text)

    header = html.xpath("//div[@class='thre']/a[img]/@href")[0]

    # starts-with 'f' already covers 'fu...' filenames from both upload hosts.
    links = html.xpath("//table//blockquote[starts-with(a/@href,'f')]")

    # Filter in a separate pass: removing items from `links` while iterating over it would skip elements.
    filtered = []
    for link in links:
        hrefs = [href for href in link.xpath("./a/@href") if not href.endswith(".mp4") and not href.endswith(".webm")]
        all_rly = "".join(link.xpath("./text()"))
        if hrefs and "削除依頼によって隔離されました" not in all_rly:
            filtered.append(link)
    links = filtered

    all_blocks = html.xpath("//table//blockquote")
    zips = []
    for block in all_blocks:
        if block not in links and block not in zips:
            all_rly = "".join(block.xpath("./text()"))
            if "削除依頼によって隔離されました" in all_rly:
                continue
            for reply in block.xpath("./text()"):
                # "fu..." also starts with "f", so a single prefix check covers both upload hosts.
                if reply.startswith("f") and \
                        not (reply.endswith(".mp4") or reply.endswith(".webm")):
                    zips.append(block)
                    break  # one matching filename is enough; avoid adding the same block twice

    links_dict = []
    for link in links:
        if "削除依頼によって隔離されました" not in "\n".join(link.xpath("./text()")):
            url = convert_url(link.xpath("./text()"))
            url.extend(link.xpath("./a/@href"))
            links_dict.append({"reply":add_ref(link,"./font/text()") + "\n".join(link.xpath("./text()")),"url":url})
    
    zips_dict = [ {"reply":add_ref(i,"./font/text()") + "\n".join(i.xpath("./text()")), "url":convert_url(i.xpath("./text()"))} for i in zips ]

    return {"header":header,"links":links_dict,"zips":zips_dict}


def mkdir(src):
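    """Create the directory if it does not already exist."""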
    if not os.path.exists(src):
        os.mkdir(src)



async def single_fetch(client,url,path,root):
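    """Download a single URL with `client` and write the body to `path` under `root`,
    logging and skipping timeouts, other request errors, and non-200 responses."""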
    try:
        resp = await client.get(url, timeout=3600)
    except httpx.TimeoutException:
        print(f"GET {url} timed out, skipping it.")
        return
    except Exception:
        print(f"GET {url} failed for another reason, skipping it.")
        return
    if resp.status_code != 200:
        print(f"Got status code {resp.status_code} for {url}, skipping it.")
    else:
        with open(os.path.join(root,path),"wb") as f:
            f.write(resp.content)

async def fetch(data,src,root):
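    """Concurrently download the thread's header image, reply images, and archives
    into the `other/` subdirectory of `root`."""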
    header = data["header"]
    imgs = data["links"]
    zips = data["zips"]


    imgs_list = [header]
    for i in imgs:
        imgs_list.extend(i["url"])
    
    zip_list = []

    for j in zips:
        zip_list.extend(j["url"])

    mkdir(os.path.join(root,"other"))

    async with httpx.AsyncClient() as r:
        task = []
        zip_task = []
        print("start downloading...")

        for img in imgs_list:
            req = None
            if img.startswith("http"):
                req = single_fetch(r,img,"other/"+pathlib.Path(img).name,root)
            else:
                req = single_fetch(r,src+"/"+img,"other/"+img,root)
            single_task = asyncio.create_task(req)
            task.append(single_task)
        for single_zip in zip_list:
            req = single_fetch(r,single_zip,"other/" + pathlib.Path(single_zip).name,root)
            single_task = asyncio.create_task(req)
            zip_task.append(single_task)

        await asyncio.gather(*task)
        print("Download pic finnish")
        await asyncio.gather(*zip_task)
        print("Download zip finnish")


def parse_reply(reply:str):
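    """Render a reply as HTML: lines starting with ">" get the board's green quote
    colour, and all lines are joined with <br>."""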
    raw = reply.splitlines()
    res = []
    for i in raw:
        if i.startswith(">"):
            res.append(f'<font color="#789922">{i}</font>')
        else:
            res.append(i)
    return "<br>".join(res)

def dump_html(data,root):
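    """Fill the originpre.html template (placeholders #header_block#, #img_blocks#,
    #zip_blocks#) from the current working directory and write root/index.html."""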
    block = '<table border=0><tr><td class=rts>...</td><td class=rtd><span class="rsc">{block_num}</span><br> &nbsp; &nbsp; {block_text}</td></tr></table>'
    block_link = '<a href="{link}">{name}</a><br><a href="{link}"><img src="{thumb}" border=0 align=left width=250 hspace=20 loading="lazy"></a>'
    block_reply = '<blockquote style="margin-left:290px;">{zips}{reply}</blockquote>'
    block_reply_noimg = '<blockquote>{zips}{reply}</blockquote>'

    
    header_block = block_link.format_map({"link":"other/"+data["header"],"name":data["header"],"thumb":"other/"+data["header"]}) + "\n"

    zip_blocks = ""
    zip_block = '<a target=_blank href="{link}">{name}</a><br>'
    
    # Combine archive-only replies and image replies into one list of reply blocks.
    no_img_block = list(data["zips"]) + list(data["links"])

    for i in data["zips"]:
        for j in i["url"]:
            zip_block_dict = {"link":"other/"+pathlib.Path(j).name,"name":pathlib.Path(j).name}
            zip_blocks += "\n" + zip_block.format_map(zip_block_dict)

    for i in data["links"]:
        for j in i["url"]:
            if pathlib.Path(j).name.endswith(".zip"):
                zip_block_dict = {"link":"other/"+pathlib.Path(j).name,"name":pathlib.Path(j).name}
                zip_blocks += "\n" + zip_block.format_map(zip_block_dict)

    img_blocks = ""
    
    for resti,restv in enumerate(no_img_block):
        inner_zip_block = ""
        for j in restv["url"]:
            zip_block_dict = {"link":"other/"+pathlib.Path(j).name,"name":pathlib.Path(j).name}
            inner_zip_block += zip_block.format_map(zip_block_dict)
        block_reply_dict = {"reply":parse_reply(restv["reply"]),"zips":inner_zip_block}
        content = block_reply_noimg.format_map(block_reply_dict)
        block_dict = {"block_num":resti+1,"block_text":content}
        img_blocks += "\n" + block.format_map(block_dict)
    

    with open(os.path.join(os.getcwd(),"originpre.html"),"r",encoding="utf-8") as origin:
        origin_text = origin.read()
    with open(os.path.join(root,"index.html"),"w",encoding="utf-8") as dump:
        dump_text = origin_text.replace("#header_block#",header_block)
        dump_text = dump_text.replace("#img_blocks#",img_blocks)
        dump_text = dump_text.replace("#zip_blocks#",zip_blocks)
        dump.write(dump_text)
    print("Dumped!")



def _main(arg=[None]):
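    """Entry point. The optional argument picks which archived threads to fetch:
    a single index ("3"), or a range "start-end" where end is exclusive (matching
    range()), "-5" means 0..4, and "1-" means 1..29 (hard-coded cap of 30). Each
    index is also used as a days-back offset for the dated output folder."""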
    ranges = [0]

    if len(sys.argv) > 1:
        arg = sys.argv
    print(arg)

    if len(arg) > 1 and arg[1].isdigit():
        ranges = [int(arg[1])]
    elif len(arg) > 1 and  not arg[1].isdigit():
        start,end = arg[1].split("-")
        if not start and end:
            ranges = list(range(int(end)))
        elif not end and start:
            ranges = list(range(int(start),30))
        else:
            ranges = list(range(int(start),int(end)))
    print(ranges)
    for index in ranges:
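        # Note: 'Etc/GMT-8' is UTC+8; the sign is inverted in POSIX-style Etc/ zone names.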
        theday = datetime.datetime.fromtimestamp(time.time(), tz=pytz.timezone('Etc/GMT-8'))
        theday = theday - datetime.timedelta(days=index)
        root = os.path.join(os.getcwd(),datetime.datetime.strftime(theday,'%Y_%m_%d'))
        mkdir(root)
        try:
            src = get_src(index)
        except AssertionError:
            raise AssertionError("Get Src Failed")

        try:
            data = get_urls_data(src)
        except AssertionError:
            raise AssertionError("Get URL Data Failed")
        
        asyncio.run(fetch(data,src,root))
        dump_html(data,root)

if __name__ == "__main__":
    print(os.getcwd())
    _main()