Useful Hacks @ ssokolow.com

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
A single-file Python CGI script for effortless sharing of other single-file
scripts. If you're viewing a "Useful Hacks" list on my website, this is the
code behind it.

Simply put your desired description into each file's docstring (for shell
scripts, it takes every commented line starting with the shabang and ending
with the first non-comment line) and drop them into a folder along with this
script.

Currently supports Bourne-compatible shell scripts and Python scripts.
Other languages under consideration.

Non-obvious Features:
 - Hyperlinks URLs and obfuscates e-mail addresses in script descriptions.
 - Configurable license name hyperlinking

Warnings:
 - The HTML templating is a quick hackjob. I'm not kidding.
 - Don't forget to remove the template bits specific to my site.

TODO:
 - Switch to a proper templating solution? (No longer a single-file script)
 - Add caching eventually (current run time for my site, 0.1 seconds)
 - Add a 5px inset border and subtle "rounded CRT glare" gradients to
"""

# Module metadata. __version__ is what the --version command-line option
# reports (see the OptionParser set-up in the __main__ block).
__appname__ = "Lazybones Script Lister"
__author__  = "Stephan Sokolow (deitarion/SSokolow)"
__version__ = "0.3.1"
__license__ = "GNU GPL 2.0 or later"

import cgi, os, parser, re, time, token, urllib
from xml.sax.saxutils import escape as xml_escape

# License text shown for a script that does not declare one of its own.
DEFAULT_LICENSE = "GNU GPL 2.0 or newer"

# Maps a compiled pattern recognising a license name to the URL describing
# that license. In every pattern, group 1 is the leading anchor/word boundary
# and group 2 is the complete license name (the text that gets hyperlinked).
#
# The patterns MUST be raw strings: in a normal string literal "\b" is a
# backspace character rather than the regex word-boundary assertion, which
# would restrict every pattern to matching only at the start of the text.
LICENSES = {
        re.compile(r"(^|\b)((GNU )?(A|Affero )(General Public License|GPL)[ ]?v?3(\.0)?)", re.IGNORECASE): "http://www.gnu.org/licenses/agpl-3.0.html",
        re.compile(r"(^|\b)((GNU )?(General Public License|GPL)[ ]?v?2(\.0)?)", re.IGNORECASE): "http://www.gnu.org/licenses/gpl-2.0.html",
        re.compile(r"(^|\b)((GNU )?(General Public License|GPL)[ ]?v?3(\.0)?)", re.IGNORECASE): "http://www.gnu.org/licenses/gpl-3.0.html",
        re.compile(r"(^|\b)((GNU )?(L|Lesser |Library )(General Public License|GPL)[ ]?v?2\.1)", re.IGNORECASE): "http://www.gnu.org/licenses/lgpl-2.1.html",
        re.compile(r"(^|\b)((GNU )?(L|Lesser |Library )(General Public License|GPL)[ ]?v?3(\.0)?)", re.IGNORECASE): "http://www.gnu.org/licenses/lgpl-3.0.html",
        re.compile(r"(^|\b)((Mozilla Public License|MPL)[ ]?v?1\.1)", re.IGNORECASE): "https://www.mozilla.org/MPL/1.1/",
        re.compile(r"(^|\b)((Mozilla Public License|MPL)[ ]?v?2(\.0)?)", re.IGNORECASE): "https://www.mozilla.org/MPL/2.0/",
        re.compile(r"(^|\b)(Apache (License )?v?2(\.0)?)", re.IGNORECASE): "http://www.opensource.org/licenses/apache2.0.php",
        re.compile(r"(^|\b)(Artistic (License )?v?2(\.0)?)", re.IGNORECASE): "http://www.perlfoundation.org/artistic_license_2_0",
        re.compile(r"(^|\b)(PSF (License )?(\d\.\d)?)", re.IGNORECASE): "http://docs.python.org/license.html",
        re.compile(r"(^|\b)((Old|Original|4-clause)[ ]?BSD( License)?)", re.IGNORECASE): "https://en.wikipedia.org/wiki/BSD_licenses#4-clause_license_.28original_.22BSD_License.22.29",
        re.compile(r"(^|\b)((New|Modified|3-clause)[ ]?BSD( License)?)", re.IGNORECASE): "http://www.opensource.org/licenses/BSD-3-Clause",
        re.compile(r"(^|\b)((2-clause |Simplified |Free)BSD( License)?)", re.IGNORECASE): "http://www.opensource.org/licenses/BSD-2-Clause",
        re.compile(r"(^|\b)((MIT|X11)( License)?)", re.IGNORECASE): "http://www.opensource.org/licenses/MIT",
        re.compile(r"(^|\b)((Eclipse Public License|EPL)[ ]?v?1?(\.0)?)", re.IGNORECASE): "http://www.eclipse.org/legal/epl-v10.html"
}

# Contents written to ".htaccess" when the script is run with --offline, next
# to the generated static index.html (see the __main__ block).
HTACCESS = """
Options -ExecCGI
SetHandler default-handler
DirectoryIndex index.html
"""

# NOTE(review): the three templates below contain no HTML tags at all, only
# their text content and blank lines. The markup appears to have been lost
# from this copy of the file -- restore it from the original before deploying.

# Emitted first, both for the script listing and for the "invalid file" page.
PAGE_HEADER = """

    
        Useful Hacks @ ssokolow.com

        
        
        

    
    
"""

# Site-specific introduction; list_content() emits it after the table of
# contents.
BODY_HEADER = """
        
          Useful Hacks [Projects]
          This page lists scripts I quickly hacked up to solve a problem but
                haven't had time to clean up for general use. Feel free to use
                them if you like.
            Note:
                quicktile.py is now available as
                ssokolow/quicktile on GitHub.
            
            Note:
                Find Dupes Fast (A.K.A.
                fastdupes.py)
                is now available as
                ssokolow/fastdupes on GitHub.
            
        
"""

# Emitted last. The timestamp is formatted once, when this module-level
# expression is evaluated, not each time the footer is output.
PAGE_FOOTER = """
        
        """ + time.strftime("This page generated at %Y-%m-%d %H:%M UTC", time.gmtime()) + """
        
        
        
        
        
    
"""

#TODO: Make use of this regex to sanitize input before using it in HTML/XML.
#(Should also be sanitizing 0xD800-0xDFFF, 0xFFFE-0xFFFF, and 0x110000, but
# that has to wait until I've added support for parsing and honoring encoding
# declarations)
control_char_re      = re.compile('[\x00-\x09\x0B\x0C\x0E-\x1F]')

# Runs of characters that are replaced when turning a filename into an anchor.
bad_anchor_char_re   = re.compile('[^A-Za-z0-9-_:.]+')

# http(s)/ftp(s) URLs; balanced parentheses are allowed inside the URL so
# that a URL which is itself wrapped in parentheses is not over-matched.
hyperlinkable_url_re = re.compile(r"""((?:ht|f)tps?://[^\s()]+(?:\([^\s()]*\)[^\s()]*)*)""", re.IGNORECASE | re.UNICODE)

# Characters that may not appear in either half of an e-mail address.
_bc = r"""!@#$%^&*()=+{}[\]|\;:'"/?>,<\s"""
# The named group needs an explicit "<name>"; a bare "(?P" followed by "[" is
# rejected by re.compile. spamProtectEmail() only reads group(0), so the name
# itself is informational.
email_address_re     = re.compile(r"""(?P<email>[^%s]+@[^%s]+\.[^%s]*[^.%s])""" % (_bc, _bc, _bc, _bc), re.UNICODE)
del _bc

class ScriptEntry(object):
    """Base class describing one listed script and rendering it as HTML.

    Subclasses provide ``shabang_re`` and ``extensions`` (used by
    ``list_content()`` to decide which class handles a file) and implement
    ``_do_init()`` to fill in the format-specific keys of ``self.metadata``.
    """

    # Defaults; each instance works on its own shallow copy of this dict.
    _metadata = {
        'name'        : '',
        'filepath'    : '',
        'filename'    : '',
        'filesize'    : 0,
        'filetime'    : 0,
        'language'    : '',
        'description' : '',
        'anchor'      : '',
        'license'     : DEFAULT_LICENSE,
        'version'     : '',
    }

    shabang_re = None   # Compiled regex matched against a file's first line
    license_re = None
    extensions = []     # File extensions (including the dot) handled
    anchors = []        # Static: shared by all entries to keep anchors unique

    def __cmp__(self, other):
        """Make ScriptEntry objects case-insensitive sortable by name."""
        return cmp(self.metadata['name'].lower(), other.metadata['name'].lower())

    def __init__(self, filename):
        """Gather generic and format-specific metadata for ``filename``.

        ``os.stat`` is called on ``filename``, so a missing or inaccessible
        file propagates that call's exception.
        """
        self.metadata = self._metadata.copy()

        stat_result = os.stat(filename)

        # Store all the metadata that isn't format-specific.
        _ = self.metadata
        _['filepath'] = os.path.normpath(filename)
        _['filename'] = os.path.basename(_['filepath'])
        _['filesize'] = stat_result.st_size
        _['filetime'] = stat_result.st_mtime

        # Construct a hyperlinkable anchor from the filename. Slicing instead
        # of indexing keeps an empty anchor from raising IndexError.
        _['anchor'] = bad_anchor_char_re.sub('_', _['filename']).lower()
        if not _['anchor'][:1].isalpha():
            _['anchor'] = 'a' + _['anchor']

        # Ensure no duplicate anchors
        if _['anchor'] in self.anchors:
            count = 0
            while ('%s%d' % (_['anchor'], count)) in self.anchors:
                count += 1
            _['anchor'] = '%s%d' % (_['anchor'], count)
        self.anchors.append(_['anchor'])

        # Make sure that the filename will be used as a fallback program name.
        _['name'] = _['filename']

        # Actually extract the metadata.
        self._do_init()

        # Allow controlled truncation of module docstrings.
        for marker in ('--snip--', '--clip--'):
            separator = '\n%s\n' % marker
            if separator in _['description']:
                _['description'] = _['description'].split(separator, 1)[0] + '\n[...]'

        # Add various pretty-printed and escaped values to the metadata dict.
        _.update({
            'fname_q': urllib.quote_plus(_['filename']),
            'fsize_p': formatFileSize(_['filesize']),
            'desc_e': xml_escape(_['description']),
            'mtime': time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime(_['filetime']))
        })

        # Hyperlink all the URLs in the description.
        # NOTE(review): the replacement markup was missing from this copy of
        # the file (a bare r'\1' is a no-op); confirm the tag/attributes.
        _['desc_e'] = hyperlinkable_url_re.sub(r'<a href="\1">\1</a>', _['desc_e'])

        # Add some spam protection to any e-mail addresses
        _['desc_e'] = email_address_re.sub(spamProtectEmail, _['desc_e'])

        # Hyperlink any licenses we can. Group 2 of every LICENSES pattern is
        # the license name. Each substitution starts again from the plain
        # license text, so if several patterns match, the last one wins.
        # NOTE(review): link markup reconstructed, as above.
        _['license_h'] = _['license']
        for regex, url in LICENSES.items():
            if regex.search(_['license']):
                _['license_h'] = regex.sub(r'<a href="%s">\2</a>' % url, _['license'])

    def _do_init(self):
        """Code to actually extract format-specific metadata goes here."""
        raise NotImplementedError("Cannot instantiate abstract class")

    def render(self, offline=False):
        """Return the HTML fragment describing this script.

        ``offline`` selects the download link target: the bare filename for
        a static listing, or a ``?get=`` query handled by the CGI script.
        The chosen target is also stored in ``self.metadata['get_url']``.
        """
        if offline:
            self.metadata['get_url'] = self.metadata['filename']
        else:
            self.metadata['get_url'] = '?get=' + self.metadata['fname_q']

        # NOTE(review): every tag in this template is a reconstruction; only
        # the text labels and %(...)s fields survived in this copy of the
        # file. Confirm the element names/classes against the site's CSS.
        output = """<h2 id="%(anchor)s">%(name)s</h2>
            <div class="entry">
            <p><a href="%(get_url)s">Download %(filename)s</a></p>
            <ul class="metadata">
                <li>Size: %(fsize_p)s</li>
                """ % self.metadata
        if self.metadata['version']:
            output += '\n<li>Version: %(version)s</li>\n' % self.metadata
        output += """<li>License: %(license_h)s</li>
                <li>Language: %(language)s</li>
                <li>Last Modified: %(mtime)s</li>
            </ul>
            <pre>%(desc_e)s</pre>
            </div>""" % self.metadata
        return output

class PythonScriptEntry(ScriptEntry):
    """ScriptEntry for Python scripts.

    Name, license and version come from module-level ``__appname__``,
    ``__license__`` and ``__version__`` string assignments; the description
    is the first string token found by ``_get_docstring()``.
    """
    shabang_re = re.compile('^#!(/usr(/local)?)?/bin/(env )?python')
    extensions = ['.py']

    # Matches `<name> = "<value>"` alone on a line, with 1-3 matching quote
    # characters on either side. The %s is filled in with the variable name.
    _variable_re = r"""^%s\s*=\s*(?P<delim>'{1,3}|\"{1,3})(?P<value>.+?)(?P=delim)\s*$"""
    _metadata_regexes = {
            'license': re.compile(_variable_re % '__license__', re.MULTILINE),
            'name'   : re.compile(_variable_re % '__appname__', re.MULTILINE),
            'version': re.compile(_variable_re % '__version__', re.MULTILINE)
    }

    def _do_init(self):
        """Fill in language, name, license, version and description."""
        _ = self.metadata
        _['language'] = 'Python'

        # Load the file and extract all metadata but the description.
        with open(_['filepath'], 'rU') as handle:
            filecontents = handle.read()
        for key, regex in self._metadata_regexes.items():
            match_obj = regex.search(filecontents)
            if match_obj:
                _[key] = match_obj.group('value')

        # Parse out the module docstring as the description. A file that
        # cannot be parsed is still listed, with an error note instead.
        try:
            # _get_docstring() yields None when the file contains no string
            # token; keep the description a string so later text processing
            # does not fail on it.
            _['description'] = self._get_docstring(filecontents) or ''
        except Exception:
            _['description'] = "ERROR: Unable to parse file."

    def _get_docstring(self, tup):
        """
        Module docstring extractor.
        Written because Demo/parser/example.py DOESN'T WORK.

        ``tup`` is either Python source text, which is first converted with
        ``parser.suite(...).totuple()``, or a (sub)tuple of that result.
        The nested tuples are searched depth-first and the text of the first
        STRING token is returned exactly as ``parser`` reports it. Returns
        None when no STRING token is found.
        """
        if isinstance(tup, basestring):
            tup = parser.suite(tup).totuple()

        if tup[0] == token.STRING:
            return tup[1]
        for value in tup:
            if isinstance(value, tuple):
                val = self._get_docstring(value)
                if val:
                    return val
        return None

class ShellScriptEntry(ScriptEntry):
    """ScriptEntry for Bourne-compatible shell scripts.

    The description is the block of comment lines at the very top of the
    file; the license is taken from a "Licensed/Released under the/a ..."
    comment inside that block, when one is present.
    """
    shabang_re = re.compile('^#!(/usr(/local)?)?/bin/(env )?(ba|k)?sh$')
    extensions = ['.sh']

    _license_re = re.compile(r"""^#\s*(Licensed|Released) under (the|a) (?P<license>.+?)(\slicense)?\.?\s*$""", re.M | re.I)

    def _do_init(self):
        """Fill in language, description and (if declared) license."""
        _ = self.metadata
        _['language'] = 'Bourne Shell Script'

        # Extract the comment block header as the description if present.
        # Reading stops at the first line that is not a comment (blank lines
        # included), so only the leading block is used.
        lines = []
        with open(_['filepath']) as handle:
            for line in handle:
                line = line.strip()
                if not line.startswith('#'):
                    break
                lines.append(line)
        _['description'] = '\n'.join(lines)

        # Extract the license info if present
        match_obj = self._license_re.search(_['description'])
        if match_obj:
            _['license'] = match_obj.group('license')

# ScriptEntry subclasses that list_content() tries, in this order, against
# each file's extension and first line.
entryClasses = [PythonScriptEntry, ShellScriptEntry]

def spamProtectEmail(match_obj):
    """Regex-substitution callback that obfuscates an e-mail address.

    Meant to be passed as the replacement to ``email_address_re.sub()``: the
    whole matched text is returned with every '@' spelled out as ' at ' and
    every '.' as ' dot ', giving docstring e-mail addresses some degree of
    spam protection.

    XXX: Should I add some randomness to the obfuscation approach?"""
    obfuscated = match_obj.group(0)
    for symbol, spoken in (('@', ' at '), ('.', ' dot ')):
        obfuscated = obfuscated.replace(symbol, spoken)
    return obfuscated

def formatFileSize(size, unit='', precision=0):
    """Take a size in bits or bytes and return it all prettied
    up and rounded to whichever unit gives the smallest number.

    A fixed unit can be specified, case-insensitively, either by its
    displayed label (bytes, KiB, MiB, GiB, TiB, PiB, EiB, ZiB, YiB) or by
    the short forms B, KB, MB, GB, TB, PB, EB, ZB, YB. An unrecognised unit
    is ignored and the unit is chosen automatically.

    Works on both negative and positive numbers. In the event
    that the given value is in bits, the user will have to
    use result = result[:-1] + 'b' to make it appear correct.

    Will calculate using integers unless precision is != 0.
    Will display using integers unless precision is > 0."""

    # Each unit's position in the list is crucial.
    # units[2] = 'MiB' and size / 1024**2 = size in MiB
    units = ['bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']

    # Upper-cased spellings accepted for each unit index, e.g. 'KIB' and 'KB'.
    unit_lookup = {'B': 0, 'BYTE': 0}
    for idx, label in enumerate(units):
        unit_lookup[label.upper()] = idx
        unit_lookup[label.upper().replace('I', '')] = idx

    if precision:
        size = float(size)

    def scale_down(value, divisor):
        # Floats divide exactly; everything else uses floor division so that
        # integer input is computed with integers on every Python version.
        if isinstance(value, float):
            return value / divisor
        return value // divisor

    # Did the calling function specify a valid unit of measurement?
    unit_idx = unit_lookup.get(unit.upper()) if unit else None
    if unit_idx is not None:                   # If so, scale straight to it.
        size = scale_down(size, 1024 ** unit_idx)
    else:                                      # If not, find the unit index by iteration.
        unit_idx = 0
        # ">=" so that exactly 1024 of a unit is shown as 1 of the next one.
        while abs(size) >= 1024 and unit_idx < (len(units) - 1):
            size = scale_down(size, 1024)
            unit_idx += 1

    return '%.*f %s' % (precision, size, units[unit_idx])

def _read_first_line(fpath):
    """Return the first line of ``fpath``, or '' if the file can't be read."""
    try:
        with open(fpath) as handle:
            return handle.readline()
    except (IOError, OSError):
        # An unreadable file simply has no recognisable shabang line.
        return ''


def _pick_entry_class(name, fpath):
    """Return the first entry class claiming the file, or None.

    Classes are tried in ``entryClasses`` order; each one claims a file by
    extension or, failing that, by its shabang line.
    """
    ext = os.path.splitext(name)[1]
    first_line = None
    for ec in entryClasses:
        if ext in ec.extensions:
            return ec
        if first_line is None:      # Only read the file when really needed.
            first_line = _read_first_line(fpath)
        if ec.shabang_re.match(first_line):
            return ec
    return None


def list_content(path='.', offline=False):
    """Generate an HTML listing of available files, complete with metadata

    ``path`` is the directory to scan (sub-directories are ignored for now).
    ``offline`` is passed on to each entry's ``render()``.
    Returns the complete page as a single string.
    """
    scripts, categories, path = [], [], os.path.abspath(path)

    for name in os.listdir(path):
        fpath = os.path.join(path, name)

        if os.path.isdir(fpath):
            continue  # TODO: Support categories.

        # Use the full path so listing works for directories other than the
        # current one, and stop at the first matching class so that a file
        # is never listed twice.
        entry_class = _pick_entry_class(name, fpath)
        if entry_class is not None:
            scripts.append(entry_class(fpath))

    # Same ordering as ScriptEntry.__cmp__: case-insensitive by name.
    scripts.sort(key=lambda entry: entry.metadata['name'].lower())

    output = [PAGE_HEADER]
    # NOTE(review): the table-of-contents markup was missing from this copy
    # of the file; the tags and the parent-site link target below are a
    # reconstruction around the surviving text. Confirm before deploying.
    output.append('<div id="toc"><h2>Table of Contents</h2>\n<ul>')
    for entry in scripts:
        tmp = '<li><a href="#%(anchor)s">%(name)s</a></li>' % entry.metadata
        output.append(tmp)
    output.append('</ul>\n<p><a href="..">Back to Parent Site</a></p></div>')
    output.append(BODY_HEADER)

    if categories:
        output.append("Categories")  # TODO: Add this to the table of contents.

    for entry in scripts:
        output.append(entry.render(offline=offline))
    output.append(PAGE_FOOTER)

    return '\n'.join(output)

if __name__ == '__main__':
    # Two modes: "--offline" writes a static index.html plus .htaccess into
    # the current directory; otherwise the script answers a CGI request.
    from optparse import OptionParser
    opt_parser = OptionParser(description=__doc__, version="%%prog v%s" % __version__)
    opt_parser.add_option('--offline', action="store_true", dest="offline",
        default=False, help="Generate a static index.html and .htaccess")

    # Allow pre-formatted descriptions
    opt_parser.formatter.format_description = lambda description: description

    opts, args = opt_parser.parse_args()

    if opts.offline:
        with open('index.html', 'w') as fh:
            fh.write(list_content(offline=True))
        with open('.htaccess', 'w') as fh:
            fh.write(HTACCESS)
    else:
        form = cgi.FieldStorage()
        if 'get' not in form:
            # No file requested: show the listing.
            print("Content-Type: text/html; charset=utf-8")
            print('')
            print(list_content())
        else:
            # getfirst() copes with a repeated "get" parameter, for which
            # form['get'] would be a list without a .value attribute.
            fname = os.path.normpath(form.getfirst('get', ''))

            # Only serve regular files below the current directory. The
            # trailing separator stops a sibling directory whose name merely
            # starts with the same characters from passing the check.
            root = os.getcwd().rstrip(os.sep) + os.sep
            inside_root = os.path.abspath(fname).startswith(root)
            if not inside_root or not os.path.isfile(fname):
                print("Content-Type: text/html; charset=utf-8")
                print('')
                print(PAGE_HEADER)
                print("Unfortunately, you have requested an invalid file. "
                      "Please try again.")
                print(PAGE_FOOTER)
            else:
                print("Content-Type: text/plain")
                print('')
                with open(fname) as fh:
                    print(fh.read())