#!/usr/bin/env python3

# v00001 - Angelo Pesce
# scrape, parse, generate HTML...

# It currently scrapes all versions (timestamps) of all images (very slow, and probably unnecessary - untested)
# - relies on content hashing to remove duplicates, then "resolves" each post to a single version of each image (if any was scraped)

# Post de-duplication relies on hashing the text content of each post - this is not perfect: it is sensitive to forum
# settings changing over time, and to things that naturally change in posts, like image view counts or signatures etc...
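#
# Pipeline: scrape wayback snapshots -> parse posts (two phpBB formats) -> de-duplicate posts
# -> download & resolve images -> generate static HTML (one page per thread, plus an index)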

import requests
from bs4 import BeautifulSoup
import time
import pickle
import os
from datetime import datetime
import html
import re
import hashlib



FORCE_RESCRAPE = False # set to True to force rescraping/additional scraping of the forum pages
FORCE_REPARSE = False # set to True to force reparsing of the scraped pages
FORCE_IMAGES = False # set to True to force rescraping/additional scraping of the images

WEBSITE = 'http://c0de517e.com'

if False: # set to True to target the original ompf.org forum (else: ompf2.com)
  OUT_DIR = 'ompf' # also used for the pickle filename
  SCRAPE_URL_PREFIX = "ompf.org/forum/viewtopic.php" # prefix
  PAGE_INTRO = '<br/>Preserving the history of the ompf.org forum. Threads scraped from wayback machine.<br/>'
  PAGE_TITLE = 'OMPF Forum.'
else:
  OUT_DIR = 'ompf2' # also used for the pickle filename
  SCRAPE_URL_PREFIX = "ompf2.com/viewtopic.php" # prefix
  PAGE_INTRO = '<br/>Preserving the history of the ompf2.com forum. Threads scraped from wayback machine.<br/>'
  PAGE_TITLE = 'OMPF2 Forum.'

os.makedirs(OUT_DIR+"/images", exist_ok=True) # also creates OUT_DIR itself if needed



def get_wayback_urls(url):
  """Fetches archived URLs with a 200 status code from Wayback Machine's CDX API."""
  api_url = f"https://web.archive.org/cdx/search/cdx?url={url}/*&output=json&filter=statuscode:200&limit=20000"
  response = requests.get(api_url, timeout=60) # timeout, so a stalled connection can't hang the run
  response.raise_for_status()
  data = response.json()
  return data[1:]  # Remove the header row

def scrape_archived_pages(archived_urls, scraped_pages):
  """
  Scrapes archived pages and stores them in a dictionary, respecting throttling limits.
  """
  print("Scraping archived pages...")
  request_count = 0
  start_time = time.time()

  for url_data in archived_urls:
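    # CDX JSON rows are [urlkey, timestamp, original, mimetype, statuscode, digest, length]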
    digest = url_data[5]
    if digest in scraped_pages:
      print(f"Skipping duplicate/already seen digest: {digest}")
      continue

    timestamp = url_data[1]
    original_url = url_data[2]
    wayback_url = f"https://web.archive.org/web/{timestamp}/{original_url}"

    try:
      response = requests.get(wayback_url, timeout=60) # timeout, so one stalled snapshot can't hang the scrape
      response.raise_for_status()
      scraped_pages[digest] = (original_url, timestamp, response.text)
      print(f"Successfully scraped: {wayback_url}", len(scraped_pages))

      request_count += 1
      if request_count >= 5:  # throttle to ~5 requests/minute, to be gentle with archive.org
        elapsed_time = time.time() - start_time
        if elapsed_time < 60:
          wait_time = 60 - elapsed_time
          print(f"Throttling: Waiting for {wait_time:.2f} seconds...")
          time.sleep(wait_time)
        request_count = 0
        start_time = time.time()

    except requests.exceptions.RequestException as e:
      print(f"Error scraping {wayback_url}: {e}")

  return scraped_pages
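
# Note: the 5-requests-per-minute throttle above is duplicated verbatim in
# process_all_images() below. A minimal sketch of a helper that could factor it
# out (illustrative only - not wired into the loops):
class RequestThrottle:
  """Sleeps as needed so that at most max_requests happen per window seconds."""
  def __init__(self, max_requests=5, window=60.0):
    self.max_requests = max_requests
    self.window = window
    self.count = 0
    self.start = time.time()
  def tick(self): # call once after each request
    self.count += 1
    if self.count >= self.max_requests:
      elapsed = time.time() - self.start
      if elapsed < self.window:
        print(f"Throttling: Waiting for {self.window - elapsed:.2f} seconds...")
        time.sleep(self.window - elapsed)
      self.count = 0
      self.start = time.time()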

def process_post_tag(tag): # parse some of the tags in the post text
      for br in tag.find_all('br'):
          br.replace_with('\n')
      for cite in tag.find_all('cite'): # processing this before a[href] to avoid processing links in the cite text
          cite.replace_with(cite.get_text())
      for a in tag.find_all('a'):
          if a.find('img'): # skip links that contain images (findChild is a legacy alias of find)
            continue
          link = a['href']
          if 'web.archive.org' in link:
            link = link.split('/', 5)[-1]  # Extract the original URL part - this is important also to properly deduplicate posts later on!
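            # e.g. "https://web.archive.org/web/20150101000000/http://ompf2.com/foo" -> "http://ompf2.com/foo"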
          if link!=a.get_text() and not a.get_text().startswith('http'): # show the link text only when it adds information (i.e. it's not the URL itself, or another URL)
            a.replace_with(f"[LINK {link} {a.get_text()}]")
          else:
            a.replace_with(f"[LINK {link}]")
      for quote in tag.find_all('blockquote'):
          quote.replace_with(f' >> {quote.get_text()}')
      for dt in tag.find_all('dt'):
          if 'class' in dt.attrs and dt['class'][-1] == 'attach-image':
            for dd in dt.parent.find_all('dd'): # there can be multiple dd tags, and the view-count one might not be the first one...
              if 'Viewed' in dd.get_text() and 'times' in dd.get_text():
                dd.replace_with("") # We need to remove the "Viewed N times" tag otherwise it will confuse de-duplication.
      imgs = []
      for frame in tag.find_all('iframe'):
            frame.replace_with("[IFRAME n/a]")
      for img in tag.find_all('img'):
          if ('class' in img.attrs and (img['class'][-1]=='smilies' or img['class'][-1]=='smiles')) or ('/smilies/' in img['src']) or ('/smiles/' in img['src']): # skip smilies
            img.replace_with(f"[SMILEY {img.get('alt','-')}]")
          else:
            imgs.append(img['src'])
            img.replace_with(f"[IMG #{len(imgs)} {img.get('alt',img.get('title','?'))}]")
      return (tag.get_text(), imgs)
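
# Example: a post body like '<a href="http://x">see</a> <img src="y.png"/>' comes back
# roughly as ('[LINK http://x see] [IMG #1 ?]', ['y.png'])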

def extract_post_data(scraped_pages):
  """Extracts post data and topic title from scraped HTML, handling different formats."""
  extracted_data = {}
  for digest, (page_url, date, html_text) in scraped_pages.items():
    wayback_url = f"https://web.archive.org/web/{date}/{page_url}"  # Reconstruct Wayback URL (before the try, so the error message below can always use it)
    try:
      soup = BeautifulSoup(html_text, 'html.parser')

      # Try to extract data in the first format (older)
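      # (format 1 looks like phpBB2-era "subSilver" markup; format 2 below, like phpBB3's "prosilver")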
      topic_title_tag = soup.find('a', class_='maintitle')

      if topic_title_tag:
        topic_title = topic_title_tag.get_text()
        topic_nav_tag = soup.find('span', class_='nav').get_text()

        posts = []
        spans = soup.find_all('span')
        i = 0
        while i < len(spans):
          if spans[i].get('class') == ['name']:
            post_data = [spans[i].get_text()]
            i += 1
            if i < len(spans) and spans[i].get('class') == ['postdetails']:
              post_data.append(spans[i].get_text())
              i += 1
              assert i < len(spans) and spans[i].get('class') == ['postdetails']
              post_details2 = spans[i].get_text()
              # Extract date using strptime, ending before "Post subject:"
              post_date_str = post_details2.split("Posted:")[1].split("Post subject:")[0].strip()
              post_date = datetime.strptime(post_date_str, "%a %b %d, %Y %I:%M %p")
              post_data.append(post_details2)  # Add second postdetails
              i += 2  # Increment i by 2 to skip the extra span
              assert i < len(spans) and spans[i].get('class') == ['postbody']
              text,imgs = process_post_tag(spans[i])
              post_data.append(text)
              post_data.append(post_date)  # Add datetime object to post_data
              post_data.append(imgs)
              posts.append(post_data)
              i += 1
          else:
            i += 1

        extracted_data[digest] = (wayback_url, topic_title, topic_nav_tag, posts, "format1")
        print(f"Successfully extracted data (format 1) from: {wayback_url} - {topic_title}")

      else:
        # Try to extract data in the second format
        topic_title_tag = soup.find('h3', class_='first')
        if topic_title_tag:
          topic_title = topic_title_tag.get_text()
          topic_nav_tag = soup.find('li', class_='icon-home')
          
          if topic_nav_tag:
            topic_nav_tag = topic_nav_tag.get_text()
          else:
            topic_nav_tag = soup.find('li', class_='breadcrumbs').get_text() # alternative nav tag

          posts = []
          for p_tag in soup.find_all('p', class_='author'):
            author_text = p_tag.get_text()
            # Extract date using strptime, handling different delimiters and formats
            if "»" in author_text:
              post_date_str = author_text.split("»")[1].strip()
            elif "on" in author_text:
              post_date_str = author_text.split("on")[1].strip()
            else:
              assert False

            if post_date_str:
              try:
                post_date = datetime.strptime(post_date_str, "%d %b %Y, %H:%M")
              except ValueError:
                try:
                  post_date = datetime.strptime(post_date_str, "%a %b %d, %Y %I:%M %p")
                except ValueError:
                  assert False

            post_data = [author_text]
            div_tag = p_tag.find_next_sibling('div', class_='content')
            assert div_tag

            text,imgs = process_post_tag(div_tag)
            post_data.append(text)

            dl_tags = div_tag.find_all_next('dl', class_='postprofile')
            if dl_tags:
                post_data.append(dl_tags[0].get_text())
            else:
                post_data.append('')
            post_data.append(post_date)  # Add datetime object to post_data
            post_data.append(imgs)

            posts.append(post_data)
          extracted_data[digest] = (wayback_url, topic_title, topic_nav_tag, posts, "format2")
          print(f"Successfully extracted data (format 2) from: {wayback_url} - {topic_title}")
        else:
          print(f"Could not extract data from: {wayback_url}")

    except Exception as e:
      print(f"Error extracting data from {wayback_url}: {e}")

  return extracted_data


scraped_data = {}
if os.path.exists(OUT_DIR+".pickle"):
  print(f"Loading scraped pages from {OUT_DIR}.pickle...")
  with open(OUT_DIR+".pickle", 'rb') as f:
    scraped_data = pickle.load(f)

if len(scraped_data)==0 or FORCE_RESCRAPE:
  archived_urls = get_wayback_urls(SCRAPE_URL_PREFIX)
  print("Found",len(archived_urls),"pages to scrape.")
  scrape_archived_pages(archived_urls, scraped_data)
  print(f"Saving scraped pages to {OUT_DIR}.pickle...")
  with open(OUT_DIR+".pickle", 'wb') as f:
    pickle.dump(scraped_data, f)



# to test a single page:
#page = next((d for d in scraped_data if scraped_data[d][0]=='http://ompf2.com/viewtopic.php?f=5&t=2132&sid=be66a3c1c13156d6289be707ce3ae3d9'))
#extract_post_data({page:scraped_data[page]})

extracted_data = []
if not FORCE_REPARSE and os.path.exists(OUT_DIR+"_extracted.pickle"): # we pickle this as well, as it takes time to generate and would otherwise impede fast iteration on the script...
  print(f"Loading extracted data from {OUT_DIR}_extracted.pickle...")
  with open(OUT_DIR+"_extracted.pickle", 'rb') as f:
    extracted_data = pickle.load(f)

if len(extracted_data)==0 or FORCE_REPARSE:
  extracted_data = extract_post_data(scraped_data)
  print(f"Saving extracted data to {OUT_DIR}_extracted.pickle...")
  with open(OUT_DIR+"_extracted.pickle", 'wb') as f:
    pickle.dump(extracted_data, f)

# Print the number of pages with extracted data
print(f"\nExtracted data from {len(extracted_data)} pages.")

data = list(extracted_data.values())
# format2: 0wayback_url, 1topic, 2topic_nav, 3[ ['by _author_ » date', post_text, footer, datetime, imgs] ... ], 4'format2'
# format1: 0wayback_url, 1topic, 2topic_nav, 3[ [author, author_data, footer, post_text, datetime, imgs] ... ], 4'format1'

topics = { page[1]:[] for page in data } # topic -> [ (date, author, text, footer, wayback_url, [[images...],...]) ... ]
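# note: threads are keyed by title, so distinct threads that happen to share the exact same title will be merged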
topics_nav = {}

posts_already_seen = {}
for page in data:
  wayback_url = page[0]
  topic = page[1]
  topics_nav[topic] = page[2] # save for later :)
  format = page[4]

  for post in page[3]:
    date = post[-2]
    imgs = post[-1]

    if format=='format2':
      author = post[0]
      text = post[1]
      footer = post[2].strip()
      if ' » ' in author:
        author = author.split(' » ')[0][3:] # skip "by "
      else:
        author = author.split(' on ')[0][3:] # skip "by "
    else: # format 1 (older)
      author = post[0]
      text = post[3]
      footer = post[1]+post[2]

    filtered_text = ''.join(text.split()) # collapse ALL whitespace, so layout changes can't break de-duplication
    if '_________________' in filtered_text:
      filtered_text_split = filtered_text.split('_________________')
      if len(filtered_text_split)==2:
        filtered_text = filtered_text_split[0] # In "format1" forum posts there can be a signature embedded in the text - which can change and cause de-duplication to fail
    # SMILEY markers can also change between forum versions/settings :/
    # (no space after SMILEY in the pattern: all whitespace was already stripped above)
    filtered_text = re.sub(r'\[SMILEY[^\]]*\]', '', filtered_text)
    #filtered_text = re.sub(r'\[LINK[^\]]*\]', '', filtered_text)
    #filtered_text = re.sub(r'\[IMG[^\]]*\]', '', filtered_text)
    
    phash = hash(filtered_text)
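    # note: hash() of a str is salted per interpreter session (PYTHONHASHSEED), which is fine here,
    # as posts_already_seen only needs to be consistent within this single run
    # (unlike topic_to_htmlname below, which needs stable filenames and uses sha256)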

    if phash not in posts_already_seen: # filter out duplicates - which happen when the archive snapshotted the same page at different times...
      pt = [date, author, text, footer, wayback_url, [imgs]]
      topics[topic].append( pt )
      posts_already_seen[phash] = pt[-1] # the list of image sets (shared with pt, so appending below updates the post too)
    else:
      posts_already_seen[phash].append(imgs) # keep the images of duplicated posts, because they are from different times / have different wayback URLs.
      print('s',end='') # progress marker for each skipped duplicate

for topic in topics:
  topics[topic].sort( key=lambda p:p[0] )
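# posts in each topic are now in chronological order - index generation below relies on topics[t][0][0] being the earliest date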



print("PROCESSING IMAGES")

def process_all_images():
  request_count = 0
  start_time = time.time()

  for topic in topics:
    for post in topics[topic]:
      for img_set in post[-1]:
        for img in img_set:
          if img not in images_db:
              if not(img.startswith("http://") or img.startswith("https://")):
                # relative URLs are skipped - this should not happen...
                # (prefixing "https://web.archive.org" would make them fetchable, if it ever does)
                continue
              img_url = img
              
              #ext = img.split('.')[-1].lower()
              #if not('png' in ext or 'jpg' in ext or 'jpeg' in ext or 'gif' in ext):
              #  print("unknown - skipping:", img)
              #  continue

              try:
                response = requests.get(img_url, allow_redirects=True, timeout=60) # timeout, so one dead link can't hang the whole run
                request_count += 1
                if request_count >= 5:
                  elapsed_time = time.time() - start_time
                  if elapsed_time < 60:
                    wait_time = 60 - elapsed_time
                    print(f"Throttling: Waiting for {wait_time:.2f} seconds...")
                    time.sleep(wait_time)
                  request_count = 0
                  start_time = time.time()

                response.raise_for_status()

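                # sniff the image type from its magic bytes, rather than trusting the URL's extension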
                if response.content.startswith(b'\xff\xd8'):
                  ext = 'jpg'
                elif response.content.startswith(b'\x89PNG'):
                  ext = 'png'
                elif response.content.startswith(b'GIF'):
                  ext = 'gif'
                elif response.content.startswith(b'BM'):
                  ext = 'bmp'
                elif response.content.startswith(b'\x00\x00\x01\x00'):
                  ext = 'ico'
                elif response.content.startswith(b'II*\x00') or response.content.startswith(b'MM\x00*'):
                  ext = 'tif'
                elif response.content.startswith(b'RIFF') and b'WEBP' in response.content[8:16]:
                  ext = 'webp'
                else:
                  print("Unknown image format - skipping:", img)
                  images_db[img] = ''
                  continue

                filename = hashlib.sha256(response.content).hexdigest() + "." + ext
                file_path = OUT_DIR+"/images/"+filename
                with open(file_path, 'wb') as file:
                  file.write(response.content)
                  print("Saved:", img, len(images_db))
                images_db[img] = file_path                  
              except requests.exceptions.RequestException as e:
                print(e)
                images_db[img] = ''

images_db = {}
if os.path.exists(OUT_DIR+"_imgdb.pickle"):
  print(f"Loading image database from {OUT_DIR}_imgdb.pickle...")
  with open(OUT_DIR+"_imgdb.pickle", 'rb') as f:
    images_db = pickle.load(f)

if len(images_db)==0 or FORCE_IMAGES: # FORCE_IMAGES also "continues" an interrupted scrape: images already in the db are skipped
  process_all_images()
  print(f"Saving image database to {OUT_DIR}_imgdb.pickle...")
  with open(OUT_DIR+"_imgdb.pickle", 'wb') as f:
    pickle.dump(images_db, f)

for topic in topics: # resolve image URLs to scraped images
  for post in topics[topic]:
    resolved_imgs = [None]*len(post[-1][0]) # sized from the first set - we assume they are all the same length
    for img_set in post[-1]: # we have different sets if a given post was scraped multiple times (at different times)
      for i,img in enumerate(img_set):
        if i < len(resolved_imgs) and img in images_db and images_db[img]!='': # guard on i, in case a later snapshot gained images; a missing key means we could not parse/did not request, '' means we could not download...
          resolved_imgs[i] = images_db[img] # if scraped, they should all be the same, so it's ok to take the last one
    for i,img in enumerate(resolved_imgs): # fill in any still-unresolved slots once, at the end
      if img is None:
        resolved_imgs[i] = post[-1][0][i] # if not found, keep the original URL
    post[-1] = resolved_imgs



print("GENERATING HTMLs")

def txt_to_html(txt): # deals with <br> and &nbsp;...
    # replace multiple spaces or spaces at the beginning of a line
    txt = txt.strip()
    txt = html.escape(txt)
    txt = txt.replace('\n\n','\n') # remove double newlines
    txt = re.sub('\n[ ]+', lambda m:'<br/>'+('&nbsp;'*(len(m.group(0))-1)), txt)
    txt = re.sub(' [ ]+', lambda m:'&nbsp;'*len(m.group(0)), txt)
    txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;') # could use &emsp; but this seems better for copy-paste?
    txt = txt.replace('\n','<br/>\n') # replace any other newlines
    return txt
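
# e.g. txt_to_html("first\n  second") -> "first<br/>&nbsp;&nbsp;second"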

def topic_to_htmlname(s):
    # python hash() is salted w/a random seed - not consistent between sessions
    #i = hash(s)
    #i = i*2 if i>0 else (-i*2+1)
    #return str(i)+".htm"
    return hashlib.sha256(s.encode()).hexdigest()+'.htm'

include_html = '<link rel="icon" type="image/x-icon" href="{web}/favicon.ico">'.format(web=WEBSITE)+"""
<meta name="viewport" content="width=device-width, initial-scale=1.0"><meta charset="UTF-8">

<meta name="description" content="Preserving history."><meta name="author" content="Angelo Pesce">
<meta name="keywords" content="scraped, archive">

<meta http-equiv="content-type" content="text/html; charset=utf-8">
<style>
body { font-family:monospace; font-size:medium; line-height:1.555;
    max-width:140ch; margin:auto; background-color:ghostwhite; color:black; }
h1 { font-weight:bold; font-size:2em; background-color:black; color:white; text-align:left; } 
div { background-color:white; padding-left:1ch; padding-right:1ch; padding-top:0.5ch; padding-bottom:0.5ch; word-wrap:break-word; hyphens:auto; margin-bottom:1em }
#ending { font-weight:bold; text-align:center; background-color:black; color:white; }
a { color:gray; text-decoration:none }
@media (prefers-color-scheme: dark) { /* night mode overrides */
    body { background-color:#333; color:white; }
    h1 { background-color:gray; color:white; } 
    div { background-color:black; }
    #ending { background-color:gray; color:white; }
    a { color:silver }
}
img { max-width:33%; height:auto; border-style:solid; border-width:1px; }
/* Mobile overrides: I used to use @media (pointer:none) or (pointer:coarse) or (hover:none) but that catches tablets too, in the end, it's best to just react to width:  */
@media only screen and (max-width: 120ch) {
  body { font-size:smaller; }
  h1 { font-size:1.5em; } 
}
@media only screen and (max-width: 80ch) {
  body { line-height:1.333; }
}
</style>
"""

page_template = """<!DOCTYPE html>
<html lang="en">
<head>
{include}
<title>{page_title}</title>
</head>
<body>
<h1>{title} <a href="{home_link}">back</a></h1>
{body}
<p id="ending">{footer} <a href="{home_link}">back</a></p>
</body>
</html>
""" # page_title title body home_link footer include

post_template = "<div><a name='p{postnum}'></a><a href='#p{postnum}'>(L)</a> <b>[{date}] [{auth}] [{subject}]</b>{opt}<br/><br/>{body}" # date subject auth body postnum opt

def html_out(topic): 
    out_path = OUT_DIR+'/'+topic_to_htmlname(topic)
    posts = topics[topic]

    html_blocks = []
    if topics_nav[topic].strip()!='':
      html_blocks.append(f"<h2>Board: {topics_nav[topic]}</h2>")

    for (i,post) in enumerate(posts): # (0date, 1author, 2text, 3footer, 4wayback_url, [resolved images...])
        txt = txt_to_html(post[2])
        opttxt = f'<a href="{post[4]}"> Wayback!</a>'

        html_blocks.append( post_template.format(date=post[0].strftime("%Y/%m/%d"), subject=html.escape(topic), auth=html.escape(post[1]), body=txt, postnum=i, opt=opttxt) )

        if len(post[-1])>0:
          html_blocks.append("<hr/>")
          for j,img_url in enumerate(post[-1]): # 'j', to avoid shadowing the post index 'i' used by postnum above
            if img_url.startswith(OUT_DIR+"/images/"): # not great way of checking...
              html_blocks.append(f'[IMG #{j+1}]:<a href="{img_url[len(OUT_DIR)+1:]}"><img src="{img_url[len(OUT_DIR)+1:]}" alt="[IMG #{j+1}]"/></a><br/>')
            else:
              html_blocks.append(f'[IMG #{j+1}]:Not scraped: <a href="{img_url}">{img_url}</a><br/>')
        html_blocks.append("</div>")

    htmltxt = page_template.format(page_title=html.escape(topic), title=html.escape(topic), body='\n'.join(html_blocks), home_link="index_main.htm", footer="", include=include_html)

    with open(out_path, 'w', encoding="utf-8") as f:
        f.write(htmltxt)



print(" generating thread pages...")

for t in topics:
    html_out(t)



include_html += """
<style>
b { color:darkcyan }
@media (prefers-color-scheme: dark) { /* night mode overrides */
    b { color:yellow }
}
</style>
""" # kinda a hack...

li_template = '{num:03} [{date}] ({numposts}) <a href="{link}">{subject}</a><br/>' # num date link subject numposts - used to use <ol><li>.... but not necessary!

def html_index_out(topics_sort, pre, titletxt, filename):
    html_blocks = [pre, '<div>']
    for i,topic in enumerate(topics_sort):
        posts = topics[topic] # (0date, 1author, 2text, 3footer, 4wayback_url, [resolved images...])

        #dates = [ p[0] for p in posts ]
        datetxt = posts[0][0].strftime("%Y/%m/%d") #min(dates).strftime("%Y/%m/%d")
        numtxt = str(len(posts)) # .zfill(4)
        subjtxt = html.escape(topic)
        html_blocks.append( li_template.format(num=i, date=datetxt, numposts=numtxt, link=topic_to_htmlname(topic), subject=subjtxt) )
    html_blocks.append('</div>')
    htmltxt = page_template.format(page_title=titletxt, title=titletxt, body='\n'.join(html_blocks), home_link=WEBSITE, footer="", include=include_html)

    out_path = OUT_DIR+'/'+filename
    with open(out_path, 'w', encoding="utf-8") as f:
        f.write(htmltxt)

print(" generating indices...")

print("Found", len([ k for k in topics if len(topics[k])==0 ]), "empty topics")

topics_sort = [ k for k in topics if len(topics[k])!=0 ]
topics_sort.sort( key = lambda k:topics[k][0][0] )

html_index_out(topics_sort, PAGE_INTRO, PAGE_TITLE, "index_main.htm")

# bonus...
#from collections import Counter

#def html_topN(topN):
#    html_blocks = ['Top posters.<br/><br/>', '<div><ol>']
#    for tt in topN:
#        html_blocks.append( '<li>({count}) {txt}</li>'.format(count=tt[1], txt=html.escape(tt[0])) )
#    html_blocks.append('</ol></div>')
#    htmltxt = page_template.format(page_title='Hall of fame', title='Hall of fame', body='\n'.join(html_blocks), home_link="index_main.htm", footer="", include=include_html)

#    out_path = FOLDER+'/'+OUT_DIR+'/hof.htm'
#    with open(out_path, 'w', encoding="utf-8") as f:
#        f.write(htmltxt)

#AUTHS = [ [pp['from'] for pp in MERGED_DB[p][1:]] for p in MERGED_DB ]
#AUTHS = [item for sublist in AUTHS for item in sublist]
#html_topN(Counter(AUTHS).most_common(1000))

print("done")