#!/usr/bin/env python3
# on unix-ish, can execute from the shell... ./txt2web.py

# ----
# c0de517e's web generator: hardcoded - not a framework - also, not good python code - sorry :)
# luckily all the website "inputs" are designed to be so braindead simple that it would not be a problem to rewrite this whole thing...
#
# NOTE - I HAD A FEW PEOPLE ASK IF THEY COULD USE THIS FOR THEIR OWN BLOGS...
#  Yes - you can, but I advise not to? It's ugly... The main takeaway here is that you CAN write your own generator
#  or even just write PLAIN HTML with some CSS to make things "pretty" - and it can be easier/better than existing
#  static website generators which, having to be flexible and able to deal with complicated stuff, tend to take a lot
#  to learn and use. That said, if you really insist on using this script as your base - feel free. Really, by now
#  my blog is not generated by a single script - I have separate ones for the "journal" and "links" sections and to automate
#  uploads - I can send them to you if you like, or (which is probably better, as they are very specific to my hosting setup)
#  just strip the functionality out of this script...
# ----

txt2web_version = 0.2 # LAST CHANGE: Added "links" feature / subpage! Other small refactoring, fixes...

# FEATURES:
# - Scans all .txt files in a folder, generates html for them and an index (homepage).
# -- index is a bit special, has a separate style
# -- .txt files starting with _ won't be indexed (but can be linked from other pages via [FILE])
# -- generates human-readable html
# -- generates RSS feed
# -- validates links, images...
# -- generates image thumbnails

# - Simple, but nice:
# -- website works from local files, does not need to spin a webserver to preview
# -- decent, minimalistic style, responsive (mobile...), respects day/night modes
# -- custom fonts, monospace-ish (I realized I did not really need monospace, also I like justified text)
# -- old browser/text browser friendly, no JS
# -- FAST page loads, CSS is inline, image (max-)size is in the tags.
# -- database to keep track of modifications

# - TXT not markdown: preserving formatting, only a few special tags are supported:
# -- [TODO:note] causes a warning / reminder
# -- [NOTE:note] hides the note (comment)
# -- [IMG:image optional_text] image has to be in a folder with the same name as the txt file, will be centered/with caption if the tag is on its own line
# -- [FILE:file optional_text] local link to file/other page - searches first the folder with the same name as the txt file, then the root
# -- [LINK:url optional_text] only for external, web links, checks that the link resolves w/o error
# -- ** text ** bold, ala Markdown
# -- blocks surrounded by three square [[[brackets]]] are quoted: the other tags are not parsed within the block, and a monospace font is used
# - NOTE: tags do NOT resolve recursively! - can't use tags inside tags
# - NOTE: tags do (now) support spaces after the ':' - but it's better to avoid them (e.g. [FILE: text.txt test] should be ok) - see the sample input sketched below
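#
# A hypothetical sample input, to illustrate the expected structure and tags (all names are made up):
#
#   My Post Title
#   An optional subtitle
#
#   Some text with **bold words** and a web link: [LINK:https://example.com an example].
#   An inline image [IMG:photo.jpg] next to a local link: [FILE:other_post.htm my other post].
#
#   [IMG:diagram.png This caption appears under the centered image]
#
#   [[[
#   quoted block: [LINK:not.parsed] stays as-is, rendered in monospace
#   ]]]
#   [NOTE:this is hidden from the output] [TODO:and this prints a warning]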

# - Website sub-sections:
# -- Journal, for shorter / more random posts
#  This is really a separate system that is "injected" here - see journal_fetch.py
# -- New! "Links" section
#  Similar to the Journal one, but different - see journal_fetch_links.py
#
# Note: the website currently also has other sections and special pages - but these are not handled by the txt2web system;
# instead they were generated by other scripts or ad-hoc systems, placed in the 'external' folder - and linked from other
# pages (typically, the homepage via index.txt)

# TODO - BUGS
# - Fix the problem where an IMG that was incorrectly declared as .png in the tag (...and is a .jpg instead) will not generate an error because the THUMBS have the .png and the check is done there...
# - favicon "fixes" - https://realfavicongenerator.net/favicon_checker?protocol=http&site=c0de517e.com
# - Should have used capture groups in the tag-parsing regex, instead of splitting by hand after a match - see the sketch after this list
# - Could be MUCH FASTER: avoid string + / += (use str.format(...) or "".join(array)) - avoid doing multiple passes... use StringIO streams...
#
# see TODO.md for feature ideas...
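#
# A minimal sketch of the capture-group idea above (commented out, illustrative only - not what the code does yet):
#
#   TAG_RE = re.compile(r"\[(LINK|IMG|FILE):\s*(\S+)(?:\s+([^\]]+))?\]")
#   for m in TAG_RE.finditer(txt):
#       kind, target, caption = m.group(1), m.group(2), m.group(3) or ""
#       ... # one pass over the matches, no [6:-1] slicing / split(maxsplit=1) by hand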

# NOTE - for VScode/Anaconda/Windows
#  Might need to switch the terminal to cmd.exe (command palette -> terminal: set default profile -> command prompt)
#  https://stackoverflow.com/questions/54828713/working-with-anaconda-in-visual-studio-code - I don't need this anymore as I now
#  develop the blog via VScode remote into a linux box.



import re
import os
import pytz
from datetime import datetime
import pickle
from urllib import parse
import requests
import locale
import hashlib
import html
import argparse
import uuid
import glob

from PIL import Image, ImageOps, ImageEnhance # requires PIL/Pillow (>=10.0.0 I think)



# Configuration file? Command line arguments? nah, just edit the script...

WEBURL = 'http://c0de517e.com'
FOLDER = '.'; os.chdir(os.path.dirname(os.path.abspath(__file__))) # Folder to process - setting it as the one containing this script

INDEX_NAME = 'index.txt' # Name of the .txt file that we'll use for the home
HOME = '/index.htm' # URL of the home page, relative to WEBURL

OFFLINE_MODE = False # Disable link validation
FORCE_UPDATE = False # Needed when changing HTML generation / templates - in practice it should never hurt, even if it takes longer ofc... Also, forces re-check of web links!
FORCE_GEN_THUMBS = False # Needed when changing image_max_width or gen_thumbs implementation

DO_NOT_UPDATE_DB = OFFLINE_MODE # in offline mode links are not validated, so we can't trust/save the DB - we need to be able to re-generate the pages once we are back online
REMOVE_TXT_NOT_FOUND = True # purge the database from old .txt files that are not currently present in the folder

EXTERNAL_FOLDER_IGNORE = 'EXTERNAL' # Avoids checking 'FILE:' links that start with this - as we don't keep big files locally, only on the web server...
IMAGE_MAX_WIDTH = 350
THUMB_SUBFOLDER = 'THUMBS' # The thumbnails will be generated in this subfolder of the .txt "data" folder (test.txt -> test/THUMBS/...)
RSS_MAX_ITEMS = 32
USE_FILEDATE = True # Should not change once a website is up and running - if you want to change, you need to patch the DB's last_time/orig_time manually



try:
    locale.setlocale(locale.LC_ALL, 'en_US') # affects strftime etc, make sure it does not change by mistake if we run this script on different computers...
except locale.Error:
    locale.setlocale(locale.LC_ALL, 'en_US.utf8') # on linux systems, use locale -a to see the installed locales
PTz = pytz.timezone("America/Los_Angeles") # "US/Pacific"

# Could/should simply import - but this adds directly to the global namespace so I don't need to change the code
# and it has some advantages when modifying and reloading...
with open("txt2web_template_str.py", 'r') as f:
    exec(f.read())
with open("journal_common.py", 'r') as f: 
    exec(f.read())
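# (For reference, a plain-import equivalent would be a wildcard import - roughly "from txt2web_template_str import *"
#  and "from journal_common import *" - a sketch, assuming both files are valid modules; exec() is kept on purpose
#  for the reload/namespace reasons above.)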

# Ok - I guess there is one command-line argument now :)
PARSER = argparse.ArgumentParser()
PARSER.add_argument( "--links", action='store_true')
PARSEDARGS = PARSER.parse_args()

LINKS_PREPASS_MODE = PARSEDARGS.links # If True, no Journal processing, no RSS...

if LINKS_PREPASS_MODE: # 'patch' global settings
    FOLDER = os.path.join(FOLDER, JOURNAL_LINKS_DIR)
    if not os.path.exists(FOLDER):
        print("ERROR: subdir does not exist:", FOLDER)
        exit(1)

    

class tty_c: # ansi escape sequences, should work on most platforms and terminals
    BLUE = '\033[94m'
    CYAN = '\033[96m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    ENDC = '\033[0m' # reset

def check_websafe(s): # True if the string survives URL-quoting unchanged
    return s == parse.quote(s)
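
# For example (hypothetical names): check_websafe("my_post.txt") is True, while
# check_websafe("my post.txt") is False, since parse.quote would turn the space into "%20".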



# resolve the few "smart tags" we support in the .txt - at this point only the title has been parsed/removed
# this is really crappy code - it's both slow, as it goes over the text multiple times, and stupid, as it
# does not warn about malformed tags (e.g. missing closing ] etc...)
def txt_parse_tags(txt, data_folder):
    tag_escape_iter = re.finditer(r"\[\[\[|\]\]\]", txt) # ... https://regex101.com/
    txt_blocks = []
    has_warnings = False

    block_open = False
    block_last = 0 # NOTE: not able to nest quotes... that would require a counter...
    for tag in tag_escape_iter:
        tag_txt = tag.group()
        span = tag.span()
        if (block_open and tag_txt=='[[[') or ((not block_open) and tag_txt==']]]'):
            print(tty_c.RED,"ERROR: mismatched quote around: ...",txt[max(0,span[0]-16) : span[0]+3].replace('\n','\\n'),tty_c.ENDC)
            return None
        block_text = txt[block_last:span[0]]
        block_text = '<p id="quoted_body_p">{t}</p>'.format(t=block_text) if block_open else block_text 
        txt_blocks.append((block_text, not block_open)) # (block text, does block need tag processing)
        block_open = not block_open
        block_last = span[1]
    txt_blocks.append( (txt[block_last:],True) )
    
    newtxt_blocks = []
    for block in txt_blocks:
        if block[1] and len(block[0])>0:
            txt = block[0]

            # "todo" tags - if found, we show a warning
            tag_todo = re.findall(r"\[TODO:[^\]]*\]", txt)
            if tag_todo:
                for tag in tag_todo:
                    print(tty_c.YELLOW,"WARNING - [TODO:] left:"+tag[6:-1],tty_c.ENDC)
                has_warnings = True
        
            # "note" tags - hide
            txt = re.sub(r"\[NOTE:[^\]]*\]\n?", "", txt)

            # ** BOLD **
            tag_link_iter = re.finditer(r"\*\*.*?\*\*", txt)
            match_list = []
            for tag in tag_link_iter:
                match_list.append( (tag.span(), tag.group()[2:-2]) )
            if len(match_list)!=0:
                newtxt = ""
                for tag in reversed(match_list):
                    newtxt = "".join(('<b>', tag[1], '</b>', txt[tag[0][1]:], newtxt))
                    txt = txt[:tag[0][0]]
                txt = txt+newtxt
            
            # external (web) links, we check if they are valid!
            tag_link_iter = re.finditer(r"\[LINK:[^\]]*\]", txt)
            match_list = []
            HEADERS = {"Accept": "text/html", # some sites refuse requests that don't look like they come from a browser
                       "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
            for tag in tag_link_iter:
                tag_parsed = tag.group()[6:-1].split(maxsplit=1)
                tag_url = tag_parsed[0]
                if not OFFLINE_MODE:
                    if tag_url[:7].lower()!="http://" and tag_url[:8].lower()!="https://":
                        tag_url = "https://"+tag_url # add protocol to naked URLs
                    try:
                        req = requests.get(tag_url, headers=HEADERS, timeout=30) # timeout so a dead server can't hang the build (30s is an arbitrary choice)
                    except Exception:
                        try:
                            req = requests.head(tag_url, timeout=30)
                        except Exception:
                            req = None
                    if req is None or not req.ok:
                        if req is not None:
                            if req.status_code<500: # 429 - too many requests, I get from instagram... In general: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
                                print(tty_c.YELLOW,"WARNING - link:",tag_url,"Status:",req.status_code,req.reason,tty_c.ENDC)
                            else:
                                print(tty_c.RED,"WARNING - bad link:",tag_url,"Status:",req.status_code,req.reason,tty_c.ENDC)
                        else:
                            print(tty_c.RED,"WARNING - bad link:",tag_url,tty_c.ENDC)
                tag_str = tag_url if len(tag_parsed)==1 else tag_parsed[1].strip()
                match_list.append( (tag.span(), tag_url, tag_str) )
            if len(match_list)!=0:
                newtxt = ""
                for tag in reversed(match_list):
                    span = tag[0]
                    newtxt = "".join(('<a href="', tag[1], '">', tag[2], '</a>', txt[span[1]:], newtxt))
                    txt = txt[:span[0]]
                txt = txt+newtxt

            # local images (we want all assets to be local), need to load them to get w/h
            tag_img_iter = re.finditer(r"\[IMG:[^\]]*\]", txt)
            match_list = []
            for tag in tag_img_iter:
                tag_parsed = tag.group()[5:-1].split(maxsplit=1)
                tag_img = tag_parsed[0]
                if not check_websafe(tag_img):
                    print(tty_c.RED,"ERROR - image name is not websafe:",tag_img,tty_c.ENDC)
                    return None
                try:
                    thumb_path = data_folder+'/'+THUMB_SUBFOLDER+'/'+tag_img[:-3]+'png' # thumbs are always PNG
                    img_path = data_folder+'/'+tag_img
                    if not os.path.exists(FOLDER+'/'+thumb_path):
                        thumb_path = img_path
                    with Image.open(FOLDER+'/'+thumb_path) as img:
                        img_size = img.size
                except Exception as e:
                    print(tty_c.RED,'ERROR - Failed to load:',tag_img,str(e),tty_c.ENDC)
                    return None
                tag_str = "" if len(tag_parsed)==1 else tag_parsed[1].strip()
                span = tag.span()
                is_tag_on_its_own_line = False
                if (span[0]==0 or txt[span[0]-1] == '\n') and (span[1]==len(txt) or txt[span[1]] == '\n'): # NOTE/TODO: this is sensitive to trailing spaces (second test)...
                    is_tag_on_its_own_line = True
                match_list.append( (span, thumb_path, tag_str, img_size, is_tag_on_its_own_line, img_path) )
            if len(match_list)!=0:
                newtxt = ""
                for tag in reversed(match_list):
                    span = tag[0]
                    tag_size = tag[3]
                    tag_size = (min(tag_size[0],IMAGE_MAX_WIDTH), tag_size[1])
                    if tag[4]: # if the img tag is on its own line, then we center it
                        if tag[2]!="": # has text & it's centered - put text in both alt and as a caption - NOTE: could use figure/figcaption
                            substr = '<center><a href="{href}" aria-label="link to large image"><img src="{src}" width="{width}" max-height="{height}" alt="{alt}"/></a><br/><small>{alt}</small></center>'
                        else:
                            substr = '<center><a href="{href}" aria-label="link to large image"><img src="{src}" width="{width}" max-height="{height}"/></a></center>'
                    else:
                        if tag[2]!="": # has text - put it in the alt
                            substr = '<a href="{href}" aria-label="link to large image"><img src="{src}" width="{width}" max-height="{height}" alt="{alt}"/></a>'
                        else:
                            substr = '<a href="{href}" aria-label="link to large image"><img src="{src}" width="{width}" max-height="{height}"/></a>'
                    substr = substr.format(href=tag[5], src=tag[1], width=tag_size[0], height=tag_size[1], alt=tag[2])
                    remaining_txt = txt[span[1]:]
                    if tag[4] and len(remaining_txt)!=0 and remaining_txt[0]=='\n':
                        remaining_txt=remaining_txt[1:] # I'm actually not entirely sure why we generate a <br> that we don't want on centered images... but this fixes it :)
                    newtxt = "".join((substr, remaining_txt, newtxt))
                    txt = txt[:span[0]]
                txt = txt+newtxt
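                # e.g. a centered, captioned [IMG:] tag ends up roughly as (hypothetical names/paths):
                # <center><a href="post/pic.jpg" ...><img src="post/THUMBS/pic.png" width="350" ... alt="caption"/></a><br/><small>caption</small></center>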

            # local files/pages (we want all assets to be local)
            tag_file_iter = re.finditer(r"\[FILE:[^\]]*\]", txt)
            match_list = []
            for tag in tag_file_iter:
                tag_parsed = tag.group()[6:-1].split(maxsplit=1)
                tag_file = tag_parsed[0]
                tag_str = tag_file if len(tag_parsed)==1 else tag_parsed[1].strip()
                if not check_websafe(tag_file):
                    print(tty_c.RED,"ERROR - file name is not websafe:",tag_file,tty_c.ENDC)
                    return None
                if not tag_file.startswith(EXTERNAL_FOLDER_IGNORE):
                    if os.path.exists(FOLDER+'/'+data_folder+'/'+tag_file): # check data folder first
                        tag_file = data_folder+'/'+tag_file # if found "patch" the link
                    elif not os.path.exists(FOLDER+'/'+tag_file): # otherwise, try from root...
                        print(tty_c.RED,"ERROR - file not found:",tag_file,tty_c.ENDC)
                        return None  
                match_list.append( (tag.span(), tag_file, tag_str) )
            if len(match_list)!=0:
                newtxt = ""
                for tag in reversed(match_list):
                    span = tag[0]
                    newtxt = "".join(('<a href="', tag[1], '">', tag[2], '</a>', txt[span[1]:],newtxt))
                    txt = txt[:span[0]]
                txt = txt+newtxt
            newtxt_blocks.append(txt)
        else:
            newtxt_blocks.append(block[0])

    return ("".join(newtxt_blocks), has_warnings)

# will fill: "page_title" "title" "subtitle" "body" "home_link" "footer" "include" in the template string
# it's not just plain text to html, it expects the .txt to have a given structure...
def txt_to_html(txt, template, txt_footer, data_folder, additional_html, include_head): # mostly deals with <br> and &nbsp...
    if txt.find('’')!=-1 or txt.find('“')!=-1 or txt.find('”')!=-1:
        print(tty_c.YELLOW,"WARNING: fancy characters found (’“”)",tty_c.ENDC)

    # Extract the title (the first line of text)
    #title_match = re.search('\s*[^\n]*\n',txt)
    #txt_title = title_match.group(0).strip() 
    #txt = txt[title_match.span()[1]:].lstrip()
    newline_idx = txt.find('\n')
    txt_title = txt[:newline_idx].strip()
    txt = txt[newline_idx+1:] # +1 to remove the \n
    print(" Title:",txt_title)

    newline_idx = txt.find('\n') # again, for the subtitle
    txt_subtitle = txt[:newline_idx].strip()
    if len(txt_subtitle)!=0: # if the second line is not blank, it's a subtitle - the convention is an empty line between title/subtitle and the text
        print(" Subtitle found:",txt_subtitle)
        txt = txt[newline_idx+1:] # +1 to remove the \n
    txt=txt.lstrip() # remove any empty line or other whitespace between the end of the title/subtitle and text

    txt_title = html.escape(txt_title)
    txt_subtitle = html.escape(txt_subtitle)

    # Normalize text for HTML, parse our custom tags, then convert all whitespace as needed
    txt = html.escape(txt)    
    ret = txt_parse_tags(txt, data_folder) # custom "tags" parsing
    if ret is None:
        return None
    txt = ret[0] # from here on, we do all the required transforms to go from a .txt to a valid HTML

    # replace runs of spaces, and spaces at the beginning of a line
    txt = re.sub('\n[ ]+', lambda m:'<br/>'+('&nbsp;'*(len(m.group(0))-1)), txt)
    txt = re.sub(' [ ]+', lambda m:'&nbsp;'*len(m.group(0)), txt)
    txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;') # could use &emsp; but this seems better for copy-paste?
    txt = txt.replace('\n','<br/>\n') # replace any other newlines
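    # e.g. (hypothetical input) "line one\n  indented" becomes "line one<br/>&nbsp;&nbsp;indented" -
    # leading and repeated spaces survive as &nbsp; so the .txt formatting is preserved in the HTML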

    unicode_error = txt.find(u'\uFFFD')
    if unicode_error!=-1: # something bad happened
        print(tty_c.RED,"ERROR: unicode replacement character found",tty_c.ENDC,txt[max(0,unicode_error-8):min(len(txt),unicode_error+8)])
    
    txt += additional_html
    # below, we know/rely on the fact that the filename (minus extension) is == the data_folder name
    page = template.format(page_title=txt_title, page_url=f"{WEBURL}/{data_folder}.htm", title=txt_title, subtitle=txt_subtitle, body=txt, home_link=HOME, footer=txt_footer, include=include_head)
    
    if not LINKS_PREPASS_MODE: 
        # why did I do this - and did not instead always go for absolute paths??!? maybe it was because it works better w/local HTML preview (no webserver)
        page = page.replace("##PATH##","") # we use relative paths here - because all .txt are in the website root
    else:
        page = page.replace("##PATH##","/") # similar to when we generate the journal htm - we need absolute paths

    return (page, txt_title, txt_subtitle, ret[1])

# Simpler, does not parse fancy tags or anything...
def plain_text_to_html(txt): # mostly deals with <br> and &nbsp...
    if txt.find('’')!=-1 or txt.find('“')!=-1 or txt.find('”')!=-1:
        print(tty_c.YELLOW,"WARNING: fancy characters found (’“”)",tty_c.ENDC)

    txt = txt.strip()

    # Normalize text for HTML (no custom tag parsing here), then convert all whitespace as needed
    txt = html.escape(txt)

    # replace runs of spaces, and spaces at the beginning of a line
    txt = re.sub('\n[ ]+', lambda m:'<br/>'+('&nbsp;'*(len(m.group(0))-1)), txt)
    txt = re.sub(' [ ]+', lambda m:'&nbsp;'*len(m.group(0)), txt)
    txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;') # could use &emsp; but this seems better for copy-paste?
    txt = txt.replace('\n','<br/>\n') # replace any other newlines

    unicode_error = txt.find(u'\uFFFD')
    if unicode_error!=-1: # something bad happened
        print(tty_c.RED,"ERROR: unicode replacement character found",tty_c.ENDC,txt[max(0,unicode_error-8):min(len(txt),unicode_error+8)])

    return txt

# Weird, but Pillow takes a palette from an image - no other way - and the palette should always be 256 entries (768 ints)
PALETTE_colors = (255,0,0, 0,255,0, 0,0,255, 0,0,0, 255,255,255)
PALETTE_img = Image.new('P', (16,16))
PALETTE_img.putpalette(PALETTE_colors + PALETTE_colors[-3:]*(256 - len(PALETTE_colors)//3)) # pad to 256 entries by repeating the last color

def gen_thumbs(dir_): # going for stylization...
    dir_ = FOLDER+'/'+dir_
    for filename in os.listdir(dir_):
        if filename.endswith(('.PNG', '.JPG', '.jpeg')): # common mistakes...
            print(tty_c.YELLOW," gen_thumbs: uppercase .PNG/.JPG and .jpeg will be skipped, probably wrong?!",tty_c.ENDC)
            continue
        if filename[0]=='.' or not filename.endswith(('.png', '.jpg')):
            continue

        outDir = dir_ + '/' + THUMB_SUBFOLDER
        if not os.path.exists(outDir):
            os.mkdir(outDir)

        outfile = outDir+'/'+filename[:-3]+"png"
        if os.path.exists(outfile) and not FORCE_GEN_THUMBS:
            continue

        print(" gen_thumbs:", filename)
        with Image.open(dir_+'/'+filename) as img:
            img = ImageOps.exif_transpose(img).convert("RGB")
            img = ImageOps.autocontrast(img)
            img = ImageEnhance.Sharpness(img).enhance(2.0)
            img.thumbnail( (IMAGE_MAX_WIDTH, img.height), Image.Resampling.LANCZOS ) # thumbnail preserves aspect ratio
            img = ImageEnhance.Sharpness(img).enhance(1.5)
            #img = img.quantize(16) # Image.Quantize.LIBIMAGEQUANT not compiled in the anaconda version of PIL - it seems
            img = img.quantize(palette=PALETTE_img, dither=Image.Dither.FLOYDSTEINBERG)
            img.save(outfile, None, optimize=True, compress_level=9)

print("\n====================================================================================================================")

print("Loading database (if exists)")

DBPATH = FOLDER+"/txt2web.pickle"
if os.path.exists(DBPATH):
    with open(DBPATH, 'rb') as f: 
        DB = pickle.load(f)
else:
    DB = {}
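
# Each DB record is keyed by the .txt filename; process_all_txt (below) stores in it:
#  'hash' (md5 of the text), 'txt_footer', 'txt_title', 'txt_subtitle', 'warnings',
#  plus 'orig_time'/'orig_filetime'/'orig_processtime' and, on later updates, their 'last_*' counterparts.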

def process_all_txt():
    for filename in os.listdir(FOLDER):
        if filename.endswith('.TXT'):
            print(tty_c.YELLOW," uppercase .TXT will be skipped, probably wrong?!",tty_c.ENDC) # "common" mistake...
            continue

        if filename[0]!='.' and filename.endswith('.txt') and filename!=INDEX_NAME and filename!='robots.txt':
            print(tty_c.BOLD,"\nProcessing:",filename,tty_c.ENDC)
            filepath = FOLDER+"/"+filename

            if not check_websafe(filename):
                print(tty_c.YELLOW," Not websafe filename, BAILING OUT!",tty_c.ENDC)
                continue

            with open(filepath, 'r', encoding='utf-8') as f:
                txt = f.read()
            txt_hash = hashlib.md5(txt.encode("utf-8")).hexdigest()

            out_filepath = FOLDER+"/"+filename.replace(".txt",".htm")
            in_db_and_not_found = filename in DB and not os.path.exists(out_filepath)

            if in_db_and_not_found:
                print(tty_c.YELLOW,"Warning: .txt was in the database, but .htm not found, re-generating",tty_c.ENDC)

            if (filename not in DB) or DB[filename]['hash']!=txt_hash or FORCE_UPDATE or in_db_and_not_found:
                timeformat = "%Y-%m-%d, %A, %B"

                processtimestr = datetime.now().astimezone(PTz).strftime(timeformat)
                modtimestr = datetime.fromtimestamp(os.path.getmtime(filepath)).astimezone(PTz).strftime(timeformat)
                timestr = modtimestr if USE_FILEDATE else processtimestr

                if filename in DB:
                    txt_footer = DB[filename]['txt_footer']
                    if (FORCE_UPDATE or in_db_and_not_found) and DB[filename]['hash']==txt_hash:
                        print( " No changes, but forced to re-generate")
                    else:
                        print( " Change detected!")
                        if DB[filename]['orig_time'][:10] != timestr[:10]: # avoid using the "last updated" footer if the change is within the same day
                            txt_footer = DB[filename]['orig_time'] + " (updated: " + timestr + ")"
                else:
                    print( " New!")
                    txt_footer = timestr

                data_folder = FOLDER+'/'+filename.replace(".txt","")
                if os.path.exists(data_folder):
                    gen_thumbs(data_folder)
                out = txt_to_html(txt, PAGE_TEMPLATE, txt_footer, data_folder, "", INCLUDE_HTML)
                if out is None:
                    print(tty_c.RED,"ERROR in txt_to_html",tty_c.ENDC)
                    continue
                
                with open(out_filepath, 'w', encoding='utf-8') as f:
                    f.write(out[0])

                # update DB
                if filename in DB: # for time we keep both the original and the last seen
                    DB[filename]['last_time'] = timestr
                    DB[filename]['last_filetime'] = modtimestr
                    DB[filename]['last_processtime'] = processtimestr
                else:
                    DB[filename] = {'orig_time':timestr, 'orig_filetime':modtimestr, 'orig_processtime':processtimestr}
                DB[filename]['hash'] = txt_hash
                DB[filename]['txt_footer'] = txt_footer
                DB[filename]['txt_title'] = out[1]
                DB[filename]['txt_subtitle'] = out[2]
                DB[filename]['warnings'] = out[3]
            else:
                print(" Already processed, skipping...")

print("Scanning for all .txt in", FOLDER)
process_all_txt()

print("\n--------------------------------------------------------------------------------------------------------------------")

JOURNAL_ROOT = None
JOURNAL_LAST_POST_TITLE = ""

def process_journal():
    global JOURNAL_ROOT, JOURNAL_LAST_POST_TITLE
    journal_parsed = load_and_parse_journal()

    for sec_idx,section in enumerate(journal_parsed):
        if len(section[1]) == 0:
            continue
        if JOURNAL_LAST_POST_TITLE == "":
            JOURNAL_LAST_POST_TITLE = section[1][0][1] # title of the first post in the first valid section (= the last post chronologically...)

        print("Section:", sec_idx,"posts:",len(section[1]))
        split_id = section[0]
        posts = []
        subjects = []

        for post in section[1]:
            # post is (0:date, 1:subject, 2:id, 3:text)
            txt = plain_text_to_html(post[3])
            txt = JOURNAL_POST_TEMPLATE.format(date=post[0], subject=post[1], id=post[2], text=txt)
            posts.append(txt)
            subjects.append( (post[1],post[2]) )
        posts = "\n".join(posts)

        journal_page = f"log_{split_id:03d}"
        if JOURNAL_ROOT is None: # extract the most recent section
            JOURNAL_ROOT = (journal_page+".htm", section[1][0][0]) # extract the date of the most recent post

        footer_txt = ""
        if sec_idx+1 < len(journal_parsed):
            next_journal_page = f"log_{journal_parsed[sec_idx+1][0]:03d}"
            footer_txt = f"<a href='{next_journal_page}.htm'>[Previous log file]</a>"

        # add a small index for the section
        subjects = [ f"- <a href='#{s[1]}'>{s[0]}</a>" for s in subjects ]
        # add a link to the previous journal page (if any)
        posts = ("<br/>".join(subjects))+(f"<br/>\n{footer_txt}" if footer_txt!="" else f"<br/>\n<a href='{WEBURL}'>[Home]</a>")+"<hr/>" + posts

        assert JOURNAL_DIR[:2] == "./" # I'm being lazy w/the replace below...
        page_url = JOURNAL_DIR.replace('./', WEBURL+'/')+journal_page+".htm"
        page_txt = PAGE_TEMPLATE.format(page_title=journal_page, page_url=page_url, title="c0de517e's journal", subtitle=journal_page, body=posts, home_link=HOME, footer=footer_txt, include=INCLUDE_HTML)

        page_txt = page_txt.replace("##PATH##","/") # we use absolute paths here - because this is used for the journal, which is in a subdirectory
        # this also implies unfortunately that the journal won't preview 100% correctly from local files...

        with open(JOURNAL_DIR+journal_page+".htm", 'w', encoding='utf-8') as f:
            f.write(page_txt)

if not LINKS_PREPASS_MODE:
    print(tty_c.BOLD,"\nProcessing Journal",tty_c.ENDC)
    process_journal()

print("\n--------------------------------------------------------------------------------------------------------------------")

def process_index_and_rss():
    indexpath = FOLDER+"/"+INDEX_NAME
    assert os.path.exists(indexpath)

    with open(indexpath, 'r', encoding='utf-8') as f:
        txt = f.read()

    DBlist = [ (DB[x]['orig_time'], x) for x in DB ]
    DBlist.sort(reverse = True)

    rss_items = []
    html_list = ""
    article_last_post_date = ""
    for x in DBlist:
        if not os.path.exists(FOLDER+'/'+x[1]):
            if REMOVE_TXT_NOT_FOUND:
                print(" Removing from DB: "+x[1])
                DB.pop(x[1])
            else:
                print(" Warning - file in DB but not on disk, won't index, please fix the DB: "+x[1])
            continue

        if x[1][0]=='_' : # hide files starting with _
            continue

        entry = DB[x[1]]
        if entry['warnings']: # also hide files that had warnings
            continue

        if article_last_post_date == "":
            article_last_post_date = f"{entry['orig_time'][:10]}"

        url = x[1].replace('.txt', '.htm')
        html_list += f"<li><a href='{url}'>{entry['txt_title']}</a></li>\n"

        # https://www.w3schools.com/xml/xml_rss.asp#rssref - I don't think I need anything else - checked with https://validator.w3.org/feed/
        if len(rss_items) < RSS_MAX_ITEMS:
            rss_items.append(
                f'<item>\n <title>{entry["txt_title"]}</title>\n <link>{WEBURL+"/"+url}</link><guid isPermaLink="false">{url}</guid>\n <description>{entry["txt_footer"]}</description>\n</item>'
            ) # guid can be any string (if isPermaLink=false)
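            # a rendered item ends up roughly like this (hypothetical title/url; the description is the page footer/date):
            # <item>
            #  <title>Some post</title>
            #  <link>http://c0de517e.com/some_post.htm</link><guid isPermaLink="false">some_post.htm</guid>
            #  <description>2024-01-01, Monday, January</description>
            # </item>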

    # I make the list look like it was made with standard text; the reason I use a real list is that lines wrap after the separator, which I like on small screens
    html_list = f'<b>Articles: </b>(updated: {article_last_post_date} - recent on top)\n<ul style="margin:0; padding-inline-start:1em; list-style:\'- \'">\n{html_list}\n</ul><br/>\n'

    if not LINKS_PREPASS_MODE: # Create the special section for the links to the journal
        journal_list = "" # HTML generated for the journal/journal links section

        if JOURNAL_ROOT is not None:
            print(tty_c.BOLD,"\nInjecting Journal into index and RSS",tty_c.ENDC)

            journal_root_date = datetime.strptime(JOURNAL_ROOT[1],"%a, %d %b %Y %H:%M:%S %z")
            journal_root_date = journal_root_date.astimezone(PTz).strftime("%Y-%m-%d")
            journal_list = f"\n<br/><b>Journal: </b>(updated: {journal_root_date})<br/>\n- Last: <a href='{JOURNAL_DIR}{JOURNAL_ROOT[0]}'>{JOURNAL_LAST_POST_TITLE}</a>"

            assert JOURNAL_DIR[:2] == "./" # I'm being lazy w/the replace below...
            rss_items.insert(0, 
                f'<item>\n <title>Journal</title>\n <link>{JOURNAL_DIR.replace("./", WEBURL+"/")}{JOURNAL_ROOT[0]}</link><guid isPermaLink="false">{uuid.uuid4()}</guid>\n <description>c0de517e journal - last update: {JOURNAL_LAST_POST_TITLE} on {journal_root_date}</description>\n</item>')

        lastLinksN = get_last_links_txt_no()
        if lastLinksN>=0:
            print(tty_c.BOLD,"\nInjecting 'Links' into index and RSS",tty_c.ENDC)

            lastLinksFileName = str(lastLinksN).zfill(5)
            links_date = os.path.getctime( os.path.join(JOURNAL_LINKS_DIR, lastLinksFileName+".txt") ) # file creation time - good enough
            links_date = datetime.fromtimestamp(links_date)
            links_date = links_date.astimezone(PTz).strftime("%Y-%m-%d")

            lastLinksHtmName = lastLinksFileName + '.htm'
            links_list = f"\n<br/><b>Bookmarks: </b>(updated: {links_date})<br/>\n- Last: <a href='{JOURNAL_LINKS_DIR}{lastLinksHtmName}'>{lastLinksFileName}</a> (<a href='{JOURNAL_LINKS_DIR}index.htm'>previous ones</a>)<br/><br/>"
            journal_list += links_list

            assert JOURNAL_LINKS_DIR[:2] == "./" # Copy-and-pasted from above...
            rss_items.insert(0, 
                f'<item>\n <title>Bookmarks</title>\n <link>{JOURNAL_LINKS_DIR.replace("./", WEBURL+"/")}{lastLinksHtmName}</link><guid isPermaLink="false">{uuid.uuid4()}</guid>\n <description>c0de517e bookmarks - last update: {lastLinksFileName} on {links_date}</description>\n</item>')
        else:
            journal_list += '<br/><br/>'

        # "Inject" the journal links at the top, before the list of articles
        html_list = journal_list + html_list

    # Finalize & Write everything out

    rss_items = "\n".join(rss_items)
    out = txt_to_html(txt, INDEX_PAGE_TEMPLATE, "txt2web "+str(txt2web_version), "index", html_list, INCLUDE_HTML_INDEX)

    if out is not None:
        out_filepath = indexpath.replace('.txt', '.htm')
        with open(out_filepath, 'w', encoding='utf-8') as f:
            f.write(out[0])

        if not LINKS_PREPASS_MODE: # Could also remove the rest of RSS generation as we never need it for links pass - but it doesn't hurt I guess
            with open(FOLDER+"/rss.xml", 'w', encoding='utf-8') as f:
                f.write(RSS_TEMPLATE.format(url=WEBURL, items=rss_items))
    else:
        print(tty_c.RED,"ERROR in txt_to_html, processing index",tty_c.ENDC)

print(tty_c.BOLD,"\nGenerating index and RSS",tty_c.ENDC)
process_index_and_rss()

if not DO_NOT_UPDATE_DB:
    with open(DBPATH, 'wb') as f: 
        pickle.dump(DB, f)
else:
    print("\nPAGES GENERATED, BUT DB HAS NOT BEEN SAVED!")
