#!/usr/bin/env python3

# v00001 - Angelo Pesce
# scrape, parse, generate HTML...

# It currently scrapes all versions (timestamps) of all images (very slow, and probably unnecessary - untested)
# - relies on content hashing to remove duplicates, then "resolves" each post to a single version of each image (if any was scraped)

# Post de-duplication relies on hashing the text content of each post - this is not perfect: it is sensitive to forum
# settings changing over time, and to things that naturally change in posts, like image view counts or signatures etc...
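#
# Pipeline: scrape wayback snapshots -> parse posts (two phpBB formats) -> de-duplicate posts
# -> download & resolve images -> generate static HTML (one page per thread, plus an index)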

import requests
from bs4 import BeautifulSoup
import time
import pickle
import os
from datetime import datetime
import html
import re
import hashlib



FORCE_RESCRAPE = False # set to True to force rescraping/additional scraping of the forum pages
FORCE_REPARSE = False # set to True to force reparsing of the scraped pages
FORCE_IMAGES = False # set to True to force rescraping/additional scraping of the images

WEBSITE = 'http://c0de517e.com'

if False: # set to True to target the original ompf.org forum (else: ompf2.com)
  OUT_DIR = 'ompf' # also used for the pickle filename
  SCRAPE_URL_PREFIX = "ompf.org/forum/viewtopic.php" # prefix
  PAGE_INTRO = '<br/>Preserving the history of the ompf.org forum. Threads scraped from wayback machine.<br/>'
  PAGE_TITLE = 'OMPF Forum.'
else:
  OUT_DIR = 'ompf2' # also used for the pickle filename
  SCRAPE_URL_PREFIX = "ompf2.com/viewtopic.php" # prefix
  PAGE_INTRO = '<br/>Preserving the history of the ompf2.com forum. Threads scraped from wayback machine.<br/>'
  PAGE_TITLE = 'OMPF2 Forum.'

os.makedirs(OUT_DIR+"/images", exist_ok=True) # also creates OUT_DIR itself if needed



def get_wayback_urls(url):
  """Fetches archived URLs with a 200 status code from Wayback Machine's CDX API."""
  api_url = f"https://web.archive.org/cdx/search/cdx?url={url}/*&output=json&filter=statuscode:200&limit=20000"
  response = requests.get(api_url, timeout=60) # timeout, so a stalled connection can't hang the run
  response.raise_for_status()
  data = response.json()
  return data[1:]  # Remove the header row

def scrape_archived_pages(archived_urls, scraped_pages):
  """
  Scrapes archived pages and stores them in a dictionary, respecting throttling limits.
  """
  print("Scraping archived pages...")
  request_count = 0
  start_time = time.time()

  for url_data in archived_urls:
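    # CDX JSON rows are [urlkey, timestamp, original, mimetype, statuscode, digest, length]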
    digest = url_data[5]
    if digest in scraped_pages:
      print(f"Skipping duplicate/already seen digest: {digest}")
      continue

    timestamp = url_data[1]
    original_url = url_data[2]
    wayback_url = f"https://web.archive.org/web/{timestamp}/{original_url}"

    try:
      response = requests.get(wayback_url, timeout=60) # timeout, so one stalled snapshot can't hang the scrape
      response.raise_for_status()
      scraped_pages[digest] = (original_url, timestamp, response.text)
      print(f"Successfully scraped: {wayback_url}", len(scraped_pages))

      request_count += 1
      if request_count >= 5:  # throttle to ~5 requests/minute, to be gentle with archive.org
        elapsed_time = time.time() - start_time
        if elapsed_time < 60:
          wait_time = 60 - elapsed_time
          print(f"Throttling: Waiting for {wait_time:.2f} seconds...")
          time.sleep(wait_time)
        request_count = 0
        start_time = time.time()

    except requests.exceptions.RequestException as e:
      print(f"Error scraping {wayback_url}: {e}")

  return scraped_pages
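
# Note: the 5-requests-per-minute throttle above is duplicated verbatim in
# process_all_images() below. A minimal sketch of a helper that could factor it
# out (illustrative only - not wired into the loops):
class RequestThrottle:
  """Sleeps as needed so that at most max_requests happen per window seconds."""
  def __init__(self, max_requests=5, window=60.0):
    self.max_requests = max_requests
    self.window = window
    self.count = 0
    self.start = time.time()
  def tick(self): # call once after each request
    self.count += 1
    if self.count >= self.max_requests:
      elapsed = time.time() - self.start
      if elapsed < self.window:
        print(f"Throttling: Waiting for {self.window - elapsed:.2f} seconds...")
        time.sleep(self.window - elapsed)
      self.count = 0
      self.start = time.time()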

def process_post_tag(tag): # parse some of the tags in the post text
      for br in tag.find_all('br'):
          br.replace_with('\n')
      for cite in tag.find_all('cite'): # processing this before a[href] to avoid processing links in the cite text
          cite.replace_with(cite.get_text())
      for a in tag.find_all('a'):
          if a.find('img'): # skip links that contain images (findChild is a legacy alias of find)
            continue
          link = a['href']
          if 'web.archive.org' in link:
            link = link.split('/', 5)[-1]  # Extract the original URL part - this is important also to properly deduplicate posts later on!
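            # e.g. "https://web.archive.org/web/20150101000000/http://ompf2.com/foo" -> "http://ompf2.com/foo"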
          if link!=a.get_text() and not a.get_text().startswith('http'): # show the link text only when it adds information (i.e. it's not the URL itself, or another URL)
            a.replace_with(f"[LINK {link} {a.get_text()}]")
          else:
            a.replace_with(f"[LINK {link}]")
      for quote in tag.find_all('blockquote'):
          quote.replace_with(f' >> {quote.get_text()}')
      for dt in tag.find_all('dt'):
          if 'class' in dt.attrs and dt['class'][-1] == 'attach-image':
            for dd in dt.parent.find_all('dd'): # there can be multiple dd tags, and the view-count one might not be the first one...
              if 'Viewed' in dd.get_text() and 'times' in dd.get_text():
                dd.replace_with("") # We need to remove the "Viewed N times" tag otherwise it will confuse de-duplication.
      imgs = []
      for frame in tag.find_all('iframe'):
            frame.replace_with("[IFRAME n/a]")
      for img in tag.find_all('img'):
          if ('class' in img.attrs and (img['class'][-1]=='smilies' or img['class'][-1]=='smiles')) or ('/smilies/' in img['src']) or ('/smiles/' in img['src']): # skip smilies
            img.replace_with(f"[SMILEY {img.get('alt','-')}]")
          else:
            imgs.append(img['src'])
            img.replace_with(f"[IMG #{len(imgs)} {img.get('alt',img.get('title','?'))}]")
      return (tag.get_text(), imgs)
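
# Example: a post body like '<a href="http://x">see</a> <img src="y.png"/>' comes back
# roughly as ('[LINK http://x see] [IMG #1 ?]', ['y.png'])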

def extract_post_data(scraped_pages):
  """Extracts post data and topic title from scraped HTML, handling different formats."""
  extracted_data = {}
  for digest, (page_url, date, html_text) in scraped_pages.items():
    wayback_url = f"https://web.archive.org/web/{date}/{page_url}"  # Reconstruct Wayback URL (before the try, so the error message below can always use it)
    try:
      soup = BeautifulSoup(html_text, 'html.parser')

      # Try to extract data in the first format (older)
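      # (format 1 looks like phpBB2-era "subSilver" markup; format 2 below, like phpBB3's "prosilver")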
      topic_title_tag = soup.find('a', class_='maintitle')

      if topic_title_tag:
        topic_title = topic_title_tag.get_text()
        topic_nav_tag = soup.find('span', class_='nav').get_text()

        posts = []
        spans = soup.find_all('span')
        i = 0
        while i < len(spans):
          if spans[i].get('class') == ['name']:
            post_data = [spans[i].get_text()]
            i += 1
            if i < len(spans) and spans[i].get('class') == ['postdetails']:
              post_data.append(spans[i].get_text())
              i += 1
              assert i < len(spans) and spans[i].get('class') == ['postdetails']
              post_details2 = spans[i].get_text()
              # Extract date using strptime, ending before "Post subject:"
              post_date_str = post_details2.split("Posted:")[1].split("Post subject:")[0].strip()
              post_date = datetime.strptime(post_date_str, "%a %b %d, %Y %I:%M %p")
              post_data.append(post_details2)  # Add second postdetails
              i += 2  # Increment i by 2 to skip the extra span
              assert i < len(spans) and spans[i].get('class') == ['postbody']
              text,imgs = process_post_tag(spans[i])
              post_data.append(text)
              post_data.append(post_date)  # Add datetime object to post_data
              post_data.append(imgs)
              posts.append(post_data)
              i += 1
          else:
            i += 1

        extracted_data[digest] = (wayback_url, topic_title, topic_nav_tag, posts, "format1")
        print(f"Successfully extracted data (format 1) from: {wayback_url} - {topic_title}")

      else:
        # Try to extract data in the second format
        topic_title_tag = soup.find('h3', class_='first')
        if topic_title_tag:
          topic_title = topic_title_tag.get_text()
          topic_nav_tag = soup.find('li', class_='icon-home')
          
          if topic_nav_tag:
            topic_nav_tag = topic_nav_tag.get_text()
          else:
            topic_nav_tag = soup.find('li', class_='breadcrumbs').get_text() # alternative nav tag

          posts = []
          for p_tag in soup.find_all('p', class_='author'):
            author_text = p_tag.get_text()
            # Extract date using strptime, handling different delimiters and formats
            if "»" in author_text:
              post_date_str = author_text.split("»")[1].strip()
            elif "on" in author_text:
              post_date_str = author_text.split("on")[1].strip()
            else:
              assert False

            if post_date_str:
              try:
                post_date = datetime.strptime(post_date_str, "%d %b %Y, %H:%M")
              except ValueError:
                try:
                  post_date = datetime.strptime(post_date_str, "%a %b %d, %Y %I:%M %p")
                except ValueError:
                  assert False

            post_data = [author_text]
            div_tag = p_tag.find_next_sibling('div', class_='content')
            assert div_tag

            text,imgs = process_post_tag(div_tag)
            post_data.append(text)

            dl_tags = div_tag.find_all_next('dl', class_='postprofile')
            if dl_tags:
                post_data.append(dl_tags[0].get_text())
            else:
                post_data.append('')
            post_data.append(post_date)  # Add datetime object to post_data
            post_data.append(imgs)

            posts.append(post_data)
          extracted_data[digest] = (wayback_url, topic_title, topic_nav_tag, posts, "format2")
          print(f"Successfully extracted data (format 2) from: {wayback_url} - {topic_title}")
        else:
          print(f"Could not extract data from: {wayback_url}")

    except Exception as e:
      print(f"Error extracting data from {wayback_url}: {e}")

  return extracted_data


scraped_data = {}
if os.path.exists(OUT_DIR+".pickle"):
  print(f"Loading scraped pages from {OUT_DIR}.pickle...")
  with open(OUT_DIR+".pickle", 'rb') as f:
    scraped_data = pickle.load(f)

if len(scraped_data)==0 or FORCE_RESCRAPE:
  archived_urls = get_wayback_urls(SCRAPE_URL_PREFIX)
  print("Found",len(archived_urls),"pages to scrape.")
  scrape_archived_pages(archived_urls, scraped_data)
  print(f"Saving scraped pages to {OUT_DIR}.pickle...")
  with open(OUT_DIR+".pickle", 'wb') as f:
    pickle.dump(scraped_data, f)



# to test a single page:
#page = next((d for d in scraped_data if scraped_data[d][0]=='http://ompf2.com/viewtopic.php?f=5&t=2132&sid=be66a3c1c13156d6289be707ce3ae3d9'))
#extract_post_data({page:scraped_data[page]})

extracted_data = []
if not FORCE_REPARSE and os.path.exists(OUT_DIR+"_extracted.pickle"): # we pickle this as well, as it takes time to generate and would otherwise impede fast iteration on the script...
  print(f"Loading extracted data from {OUT_DIR}_extracted.pickle...")
  with open(OUT_DIR+"_extracted.pickle", 'rb') as f:
    extracted_data = pickle.load(f)

if len(extracted_data)==0 or FORCE_REPARSE:
  extracted_data = extract_post_data(scraped_data)
  print(f"Saving extracted data to {OUT_DIR}_extracted.pickle...")
  with open(OUT_DIR+"_extracted.pickle", 'wb') as f:
    pickle.dump(extracted_data, f)

# Print the number of pages with extracted data
print(f"\nExtracted data from {len(extracted_data)} pages.")

data = list(extracted_data.values())
# format2: 0wayback_url, 1topic, 2topic_nav, 3[ ['by _author_ » date', post_text, footer, datetime, imgs] ... ], 4'format2'
# format1: 0wayback_url, 1topic, 2topic_nav, 3[ [author, author_data, footer, post_text, datetime, imgs] ... ], 4'format1'

topics = { page[1]:[] for page in data } # topic -> [ (date, author, text, footer, wayback_url, [[images...],...]) ... ]
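# note: threads are keyed by title, so distinct threads that happen to share the exact same title will be merged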
topics_nav = {}

posts_already_seen = {}
for page in data:
  wayback_url = page[0]
  topic = page[1]
  topics_nav[topic] = page[2] # save for later :)
  format = page[4]

  for post in page[3]:
    date = post[-2]
    imgs = post[-1]

    if format=='format2':
      author = post[0]
      text = post[1]
      footer = post[2].strip()
      if ' » ' in author:
        author = author.split(' » ')[0][3:] # skip "by "
      else:
        author = author.split(' on ')[0][3:] # skip "by "
    else: # format 1 (older)
      author = post[0]
      text = post[3]
      footer = post[1]+post[2]

    filtered_text = ''.join(text.split()) # collapse ALL whitespace, so layout changes can't break de-duplication
    if '_________________' in filtered_text:
      filtered_text_split = filtered_text.split('_________________')
      if len(filtered_text_split)==2:
        filtered_text = filtered_text_split[0] # In "format1" forum posts there can be a signature embedded in the text - which can change and cause de-duplication to fail
    # SMILEY markers can also change between forum versions/settings :/
    # (no space after SMILEY in the pattern: all whitespace was already stripped above)
    filtered_text = re.sub(r'\[SMILEY[^\]]*\]', '', filtered_text)
    #filtered_text = re.sub(r'\[LINK[^\]]*\]', '', filtered_text)
    #filtered_text = re.sub(r'\[IMG[^\]]*\]', '', filtered_text)
    
    phash = hash(filtered_text)
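    # note: hash() of a str is salted per interpreter session (PYTHONHASHSEED), which is fine here,
    # as posts_already_seen only needs to be consistent within this single run
    # (unlike topic_to_htmlname below, which needs stable filenames and uses sha256)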

    if phash not in posts_already_seen: # filter out duplicates - which happen when the archive snapshotted the same page at different times...
      pt = [date, author, text, footer, wayback_url, [imgs]]
      topics[topic].append( pt )
      posts_already_seen[phash] = pt[-1] # the list of image sets (shared with pt, so appending below updates the post too)
    else:
      posts_already_seen[phash].append(imgs) # keep the images of duplicated posts, because they are from different times / have different wayback URLs.
      print('s',end='') # progress marker for each skipped duplicate

for topic in topics:
  topics[topic].sort( key=lambda p:p[0] )
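# posts in each topic are now in chronological order - index generation below relies on topics[t][0][0] being the earliest date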



print("PROCESSING IMAGES")

def process_all_images():
  request_count = 0
  start_time = time.time()

  for topic in topics:
    for post in topics[topic]:
      for img_set in post[-1]:
        for img in img_set:
          if img not in images_db:
              if not(img.startswith("http://") or img.startswith("https://")):
                # relative URLs are skipped - this should not happen...
                # (prefixing "https://web.archive.org" would make them fetchable, if it ever does)
                continue
              img_url = img
              
              #ext = img.split('.')[-1].lower()
              #if not('png' in ext or 'jpg' in ext or 'jpeg' in ext or 'gif' in ext):
              #  print("unknown - skipping:", img)
              #  continue

              try:
                response = requests.get(img_url, allow_redirects=True, timeout=60) # timeout, so one dead link can't hang the whole run
                request_count += 1
                if request_count >= 5:
                  elapsed_time = time.time() - start_time
                  if elapsed_time < 60:
                    wait_time = 60 - elapsed_time
                    print(f"Throttling: Waiting for {wait_time:.2f} seconds...")
                    time.sleep(wait_time)
                  request_count = 0
                  start_time = time.time()

                response.raise_for_status()

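                # sniff the image type from its magic bytes, rather than trusting the URL's extension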
                if response.content.startswith(b'\xff\xd8'):
                  ext = 'jpg'
                elif response.content.startswith(b'\x89PNG'):
                  ext = 'png'
                elif response.content.startswith(b'GIF'):
                  ext = 'gif'
                elif response.content.startswith(b'BM'):
                  ext = 'bmp'
                elif response.content.startswith(b'\x00\x00\x01\x00'):
                  ext = 'ico'
                elif response.content.startswith(b'II*\x00') or response.content.startswith(b'MM\x00*'):
                  ext = 'tif'
                elif response.content.startswith(b'RIFF') and b'WEBP' in response.content[8:16]:
                  ext = 'webp'
                else:
                  print("Unknown image format - skipping:", img)
                  images_db[img] = ''
                  continue

                filename = hashlib.sha256(response.content).hexdigest() + "." + ext
                file_path = OUT_DIR+"/images/"+filename
                with open(file_path, 'wb') as file:
                  file.write(response.content)
                  print("Saved:", img, len(images_db))
                images_db[img] = file_path                  
              except requests.exceptions.RequestException as e:
                print(e)
                images_db[img] = ''

images_db = {}
if os.path.exists(OUT_DIR+"_imgdb.pickle"):
  print(f"Loading image database from {OUT_DIR}_imgdb.pickle...")
  with open(OUT_DIR+"_imgdb.pickle", 'rb') as f:
    images_db = pickle.load(f)

if len(images_db)==0 or FORCE_IMAGES: # FORCE_IMAGES also "continues" an interrupted scrape: images already in the db are skipped
  process_all_images()
  print(f"Saving image database to {OUT_DIR}_imgdb.pickle...")
  with open(OUT_DIR+"_imgdb.pickle", 'wb') as f:
    pickle.dump(images_db, f)

for topic in topics: # resolve image URLs to scraped images
  for post in topics[topic]:
    resolved_imgs = [None]*len(post[-1][0]) # sized from the first set - we assume they are all the same length
    for img_set in post[-1]: # we have different sets if a given post was scraped multiple times (at different times)
      for i,img in enumerate(img_set):
        if i < len(resolved_imgs) and img in images_db and images_db[img]!='': # guard on i, in case a later snapshot gained images; a missing key means we could not parse/did not request, '' means we could not download...
          resolved_imgs[i] = images_db[img] # if scraped, they should all be the same, so it's ok to take the last one
    for i,img in enumerate(resolved_imgs): # fill in any still-unresolved slots once, at the end
      if img is None:
        resolved_imgs[i] = post[-1][0][i] # if not found, keep the original URL
    post[-1] = resolved_imgs



print("GENERATING HTMLs")

def txt_to_html(txt): # deals with <br> and &nbsp;...
    # replace multiple spaces or spaces at the beginning of a line
    txt = txt.strip()
    txt = html.escape(txt)
    txt = txt.replace('\n\n','\n') # remove double newlines
    txt = re.sub('\n[ ]+', lambda m:'<br/>'+('&nbsp;'*(len(m.group(0))-1)), txt)
    txt = re.sub(' [ ]+', lambda m:'&nbsp;'*len(m.group(0)), txt)
    txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;') # could use &emsp; but this seems better for copy-paste?
    txt = txt.replace('\n','<br/>\n') # replace any other newlines
    return txt
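
# e.g. txt_to_html("first\n  second") -> "first<br/>&nbsp;&nbsp;second"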

def topic_to_htmlname(s):
    # python hash() is salted w/a random seed - not consistent between sessions
    #i = hash(s)
    #i = i*2 if i>0 else (-i*2+1)
    #return str(i)+".htm"
    return hashlib.sha256(s.encode()).hexdigest()+'.htm'

include_html = '<link rel="icon" type="image/x-icon" href="{web}/favicon.ico">'.format(web=WEBSITE)+"""
<meta name="viewport" content="width=device-width, initial-scale=1.0"><meta charset="UTF-8">

<meta name="description" content="Preserving history."><meta name="author" content="Angelo Pesce">
<meta name="keywords" content="scraped, archive">

<meta http-equiv="content-type" content="text/html; charset=utf-8">
<style>
body { font-family:monospace; font-size:medium; line-height:1.555;
    max-width:140ch; margin:auto; background-color:ghostwhite; color:black; }
h1 { font-weight:bold; font-size:2em; background-color:black; color:white; text-align:left; } 
div { background-color:white; padding-left:1ch; padding-right:1ch; padding-top:0.5ch; padding-bottom:0.5ch; word-wrap:break-word; hyphens:auto; margin-bottom:1em }
#ending { font-weight:bold; text-align:center; background-color:black; color:white; }
a { color:gray; text-decoration:none }
@media (prefers-color-scheme: dark) { /* night mode overrides */
    body { background-color:#333; color:white; }
    h1 { background-color:gray; color:white; } 
    div { background-color:black; }
    #ending { background-color:gray; color:white; }
    a { color:silver }
}
img { max-width:33%; height:auto; border-style:solid; border-width:1px; }
/* Mobile overrides: I used to use @media (pointer:none) or (pointer:coarse) or (hover:none) but that catches tablets too, in the end, it's best to just react to width:  */
@media only screen and (max-width: 120ch) {
  body { font-size:smaller; }
  h1 { font-size:1.5em; } 
}
@media only screen and (max-width: 80ch) {
  body { line-height:1.333; }
}
</style>
"""

page_template = """<!DOCTYPE html>
<html lang="en">
<head>
{include}
<title>{page_title}</title>
</head>
<body>
<h1>{title} <a href="{home_link}">back</a></h1>
{body}
<p id="ending">{footer} <a href="{home_link}">back</a></p>
</body>
</html>
""" # page_title title body home_link footer include

post_template = "<div><a name='p{postnum}'></a><a href='#p{postnum}'>(L)</a> <b>[{date}] [{auth}] [{subject}]</b>{opt}<br/><br/>{body}" # date subject auth body postnum opt

def html_out(topic): 
    out_path = OUT_DIR+'/'+topic_to_htmlname(topic)
    posts = topics[topic]

    html_blocks = []
    if topics_nav[topic].strip()!='':
      html_blocks.append(f"<h2>Board: {topics_nav[topic]}</h2>")

    for (i,post) in enumerate(posts): # (0date, 1author, 2text, 3footer, 4wayback_url, [resolved images...])
        txt = txt_to_html(post[2])
        opttxt = f'<a href="{post[4]}"> Wayback!</a>'

        html_blocks.append( post_template.format(date=post[0].strftime("%Y/%m/%d"), subject=html.escape(topic), auth=html.escape(post[1]), body=txt, postnum=i, opt=opttxt) )

        if len(post[-1])>0:
          html_blocks.append("<hr/>")
          for j,img_url in enumerate(post[-1]): # 'j', to avoid shadowing the post index 'i' used by postnum above
            if img_url.startswith(OUT_DIR+"/images/"): # not great way of checking...
              html_blocks.append(f'[IMG #{j+1}]:<a href="{img_url[len(OUT_DIR)+1:]}"><img src="{img_url[len(OUT_DIR)+1:]}" alt="[IMG #{j+1}]"/></a><br/>')
            else:
              html_blocks.append(f'[IMG #{j+1}]:Not scraped: <a href="{img_url}">{img_url}</a><br/>')
        html_blocks.append("</div>")

    htmltxt = page_template.format(page_title=html.escape(topic), title=html.escape(topic), body='\n'.join(html_blocks), home_link="index_main.htm", footer="", include=include_html)

    with open(out_path, 'w', encoding="utf-8") as f:
        f.write(htmltxt)



print(" generating thread pages...")

for t in topics:
    html_out(t)



include_html += """
<style>
b { color:darkcyan }
@media (prefers-color-scheme: dark) { /* night mode overrides */
    b { color:yellow }
}
</style>
""" # kinda a hack...

li_template = '{num:03} [{date}] ({numposts}) <a href="{link}">{subject}</a><br/>' # num date link subject numposts - used to use <ol><li>.... but not necessary!

def html_index_out(topics_sort, pre, titletxt, filename):
    html_blocks = [pre, '<div>']
    for i,topic in enumerate(topics_sort):
        posts = topics[topic] # (0date, 1author, 2text, 3footer, 4wayback_url, [resolved images...])

        #dates = [ p[0] for p in posts ]
        datetxt = posts[0][0].strftime("%Y/%m/%d") #min(dates).strftime("%Y/%m/%d")
        numtxt = str(len(posts)) # .zfill(4)
        subjtxt = html.escape(topic)
        html_blocks.append( li_template.format(num=i, date=datetxt, numposts=numtxt, link=topic_to_htmlname(topic), subject=subjtxt) )
    html_blocks.append('</div>')
    htmltxt = page_template.format(page_title=titletxt, title=titletxt, body='\n'.join(html_blocks), home_link=WEBSITE, footer="", include=include_html)

    out_path = OUT_DIR+'/'+filename
    with open(out_path, 'w', encoding="utf-8") as f:
        f.write(htmltxt)

print(" generating indices...")

print("Found", len([ k for k in topics if len(topics[k])==0 ]), "empty topics")

topics_sort = [ k for k in topics if len(topics[k])!=0 ]
topics_sort.sort( key = lambda k:topics[k][0][0] )

html_index_out(topics_sort, PAGE_INTRO, PAGE_TITLE, "index_main.htm")

# bonus...
#from collections import Counter

#def html_topN(topN):
#    html_blocks = ['Top posters.<br/><br/>', '<div><ol>']
#    for tt in topN:
#        html_blocks.append( '<li>({count}) {txt}</li>'.format(count=tt[1], txt=html.escape(tt[0])) )
#    html_blocks.append('</ol></div>')
#    htmltxt = page_template.format(page_title='Hall of fame', title='Hall of fame', body='\n'.join(html_blocks), home_link="index_main.htm", footer="", include=include_html)

#    out_path = FOLDER+'/'+OUT_DIR+'/hof.htm'
#    with open(out_path, 'w', encoding="utf-8") as f:
#        f.write(htmltxt)

#AUTHS = [ [pp['from'] for pp in MERGED_DB[p][1:]] for p in MERGED_DB ]
#AUTHS = [item for sublist in AUTHS for item in sublist]
#html_topN(Counter(AUTHS).most_common(1000))

print("done")