From 80966b7fb821bee72730877d36cb8a9db7d43cd3 Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 20 Nov 2015 04:08:53 -0800 Subject: [PATCH] else --- .gitignore | 54 +- AHK/rapid_t.ahk | 11 + Decorner/decorner.py | 1 + ICO/icoconvert.py | 3 +- Passwordy/passwordy.py | 23 +- ProjectileParabola/projectileparabola.py | 25 +- ProjectileParabola/projectiles.png | 4 +- SnudownTest/.gitignore | 10 + SnudownTest/.gitmodules | 4 + SnudownTest/Python.h | 133 ++ SnudownTest/SECURITY.md | 12 + SnudownTest/autolink.c | 487 ++++ SnudownTest/autolink.h | 59 + SnudownTest/buffer.c | 236 ++ SnudownTest/buffer.h | 100 + SnudownTest/debian/changelog | 145 ++ SnudownTest/debian/compat | 1 + SnudownTest/debian/control | 15 + SnudownTest/debian/copyright | 30 + SnudownTest/debian/rules | 9 + SnudownTest/debian/source/format | 1 + SnudownTest/fuzzing/CMakeLists.txt | 37 + SnudownTest/fuzzing/Makefile | 62 + SnudownTest/fuzzing/gen_testcases.py | 20 + SnudownTest/fuzzing/snudown-validator.c | 226 ++ SnudownTest/fuzzing/triageerrors.sh | 2 + SnudownTest/fuzzing/validatemd.sh | 3 + SnudownTest/gperf.exe | 3 + SnudownTest/houdini.h | 37 + SnudownTest/houdini_href_e.c | 116 + SnudownTest/houdini_html_e.c | 87 + SnudownTest/html.c | 790 +++++++ SnudownTest/html.h | 83 + SnudownTest/html/houdini.h | 37 + SnudownTest/html/houdini_href_e.c | 116 + SnudownTest/html/houdini_html_e.c | 87 + SnudownTest/html/html.c | 790 +++++++ SnudownTest/html/html.h | 83 + SnudownTest/html/html_smartypants.c | 389 ++++ SnudownTest/html_block_names.txt | 25 + SnudownTest/html_blocks.h | 206 ++ SnudownTest/html_entities.gperf | 292 +++ SnudownTest/html_smartypants.c | 389 ++++ SnudownTest/markdown.c | 2661 ++++++++++++++++++++++ SnudownTest/markdown.h | 140 ++ SnudownTest/setup.py | 56 + SnudownTest/snudown - Copy.c | 212 ++ SnudownTest/snudown-validator.c | 226 ++ SnudownTest/snudown.c | 232 ++ SnudownTest/src/autolink.c | 487 ++++ SnudownTest/src/autolink.h | 59 + SnudownTest/src/buffer.c | 236 ++ 
SnudownTest/src/buffer.h | 100 + SnudownTest/src/html_blocks.h | 206 ++ SnudownTest/src/html_entities.gperf | 292 +++ SnudownTest/src/markdown.c | 2661 ++++++++++++++++++++++ SnudownTest/src/markdown.h | 140 ++ SnudownTest/src/stack.c | 81 + SnudownTest/src/stack.h | 29 + SnudownTest/stack.c | 81 + SnudownTest/stack.h | 29 + SnudownTest/stdint.h | 199 ++ SnudownTest/sundown.def | 20 + SnudownTest/test_snudown.py | 461 ++++ Symlinker/symlinker.py | 153 ++ gitnotes.txt | 1 + 66 files changed, 13955 insertions(+), 50 deletions(-) create mode 100644 AHK/rapid_t.ahk create mode 100644 SnudownTest/.gitignore create mode 100644 SnudownTest/.gitmodules create mode 100644 SnudownTest/Python.h create mode 100644 SnudownTest/SECURITY.md create mode 100644 SnudownTest/autolink.c create mode 100644 SnudownTest/autolink.h create mode 100644 SnudownTest/buffer.c create mode 100644 SnudownTest/buffer.h create mode 100644 SnudownTest/debian/changelog create mode 100644 SnudownTest/debian/compat create mode 100644 SnudownTest/debian/control create mode 100644 SnudownTest/debian/copyright create mode 100644 SnudownTest/debian/rules create mode 100644 SnudownTest/debian/source/format create mode 100644 SnudownTest/fuzzing/CMakeLists.txt create mode 100644 SnudownTest/fuzzing/Makefile create mode 100644 SnudownTest/fuzzing/gen_testcases.py create mode 100644 SnudownTest/fuzzing/snudown-validator.c create mode 100644 SnudownTest/fuzzing/triageerrors.sh create mode 100644 SnudownTest/fuzzing/validatemd.sh create mode 100644 SnudownTest/gperf.exe create mode 100644 SnudownTest/houdini.h create mode 100644 SnudownTest/houdini_href_e.c create mode 100644 SnudownTest/houdini_html_e.c create mode 100644 SnudownTest/html.c create mode 100644 SnudownTest/html.h create mode 100644 SnudownTest/html/houdini.h create mode 100644 SnudownTest/html/houdini_href_e.c create mode 100644 SnudownTest/html/houdini_html_e.c create mode 100644 SnudownTest/html/html.c create mode 100644 SnudownTest/html/html.h 
create mode 100644 SnudownTest/html/html_smartypants.c create mode 100644 SnudownTest/html_block_names.txt create mode 100644 SnudownTest/html_blocks.h create mode 100644 SnudownTest/html_entities.gperf create mode 100644 SnudownTest/html_smartypants.c create mode 100644 SnudownTest/markdown.c create mode 100644 SnudownTest/markdown.h create mode 100644 SnudownTest/setup.py create mode 100644 SnudownTest/snudown - Copy.c create mode 100644 SnudownTest/snudown-validator.c create mode 100644 SnudownTest/snudown.c create mode 100644 SnudownTest/src/autolink.c create mode 100644 SnudownTest/src/autolink.h create mode 100644 SnudownTest/src/buffer.c create mode 100644 SnudownTest/src/buffer.h create mode 100644 SnudownTest/src/html_blocks.h create mode 100644 SnudownTest/src/html_entities.gperf create mode 100644 SnudownTest/src/markdown.c create mode 100644 SnudownTest/src/markdown.h create mode 100644 SnudownTest/src/stack.c create mode 100644 SnudownTest/src/stack.h create mode 100644 SnudownTest/stack.c create mode 100644 SnudownTest/stack.h create mode 100644 SnudownTest/stdint.h create mode 100644 SnudownTest/sundown.def create mode 100644 SnudownTest/test_snudown.py create mode 100644 Symlinker/symlinker.py create mode 100644 gitnotes.txt diff --git a/.gitignore b/.gitignore index 9112c56..f39cf74 100644 --- a/.gitignore +++ b/.gitignore @@ -1,34 +1,36 @@ -Classifieds/ - -# Windows image file caches -Thumbs.db -ehthumbs.db - -# Folder config file -Desktop.ini - -# Recycle Bin used on file shares -$RECYCLE.BIN/ - -# Windows Installer files -*.cab -*.msi -*.msm -*.msp - -# ========================= -# Operating System Files -# ========================= - -# OSX -# ========================= - +AwfulCrateBox/ +Classifieds/ + +# Windows image file caches +Thumbs.db +ehthumbs.db + +# Folder config file +Desktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msm +*.msp + +# ========================= +# Operating 
System Files +# ========================= + +# OSX +# ========================= + .DS_Store .AppleDouble .LSOverride # Icon must end with two \r -Icon +Icon + # Thumbnails ._* diff --git a/AHK/rapid_t.ahk b/AHK/rapid_t.ahk new file mode 100644 index 0000000..42f787f --- /dev/null +++ b/AHK/rapid_t.ahk @@ -0,0 +1,11 @@ +#NoEnv ; Recommended for performance and compatibility with future AutoHotkey releases. +SendMode Input ; Recommended for new scripts due to its superior speed and reliability. +SetWorkingDir %A_ScriptDir% ; Ensures a consistent starting directory. + ++T:: + While GetKeyState("t", "P") + { + Click WheelDown + Sleep 20 + } +Return \ No newline at end of file diff --git a/Decorner/decorner.py b/Decorner/decorner.py index f18fab5..f8b1d42 100644 --- a/Decorner/decorner.py +++ b/Decorner/decorner.py @@ -3,6 +3,7 @@ import sys imagename = sys.argv[1] image = Image.open(imagename) +image = image.convert('RGBA') w = image.size[0] - 1 h = image.size[1] - 1 for i in range(5): diff --git a/ICO/icoconvert.py b/ICO/icoconvert.py index 23b9b13..fefdb61 100644 --- a/ICO/icoconvert.py +++ b/ICO/icoconvert.py @@ -64,7 +64,8 @@ import binascii import sys from PIL import Image - +import os +print(os.getcwd()) try: INPUTFILE = sys.argv[1] except: diff --git a/Passwordy/passwordy.py b/Passwordy/passwordy.py index 1e4451a..1685504 100644 --- a/Passwordy/passwordy.py +++ b/Passwordy/passwordy.py @@ -33,7 +33,7 @@ HELP_SENTENCE = ''' --------------------------------------------------------------- '''[1:-1] % (DEFAULT_SENTENCE) -def make_password(length=None, allowpunctuation=False, allowdigits=False): +def make_password(length=None, allowpunctuation=False, allowdigits=False, digits_only=False, binary=False): ''' Returns a string of length `length` consisting of a random selection of uppercase and lowercase letters, as well as punctuation and digits @@ -41,12 +41,17 @@ def make_password(length=None, allowpunctuation=False, allowdigits=False): ''' if length is None: length = 
DEFAULT_LENGTH - - s = string.ascii_letters - if allowpunctuation is True: - s += string.punctuation - if allowdigits is True: - s += string.digits + + if digits_only is False and binary is False: + s = string.ascii_letters + if allowpunctuation is True: + s += string.punctuation + if allowdigits is True: + s += string.digits + elif digits_only: + s = string.digits + elif binary: + s = '01' password = ''.join([random.choice(s) for x in range(length)]) return password @@ -100,7 +105,9 @@ if __name__ == '__main__': if mode == 'password': punc = 'p' in args digi = 'd' in args - print(make_password(length, punc, digi)) + digi_only = 'dd' in args + binary = 'b' in args + print(make_password(length, punc, digi, digi_only, binary)) elif mode == 'sentence': if argc == 3: diff --git a/ProjectileParabola/projectileparabola.py b/ProjectileParabola/projectileparabola.py index eedc1a3..6c042a4 100644 --- a/ProjectileParabola/projectileparabola.py +++ b/ProjectileParabola/projectileparabola.py @@ -16,7 +16,7 @@ def quadratic_formula(a, b, c): discriminant = math.sqrt(discriminant) b *= -1 possible = (b + discriminant, b - discriminant) - possible = [x / (2*a) for x in possible] + possible = tuple(x / (2*a) for x in possible) return possible def time_to_known_distance(velocity, distance, acceleration): @@ -29,9 +29,11 @@ def time_to_known_distance(velocity, distance, acceleration): return min(possible) def make_throw(starting_x, starting_y, starting_velocity, thrown_angle): + # We don't track smallest_y because it's going to be 0! 
global smallest_x global largest_x global largest_y + upward = thrown_angle in range(1, 179, 1) or thrown_angle in range(-181, -359, -1) upward = -1 if upward else 1 @@ -40,13 +42,6 @@ def make_throw(starting_x, starting_y, starting_velocity, thrown_angle): cos = math.cos(rads) tan = math.tan(rads) - throw = {'angle': thrown_angle} - throw['horizontal_component'] = starting_velocity * cos * -upward - throw['vertical_component'] = starting_velocity * sin * upward - #print(thrown_angle, starting_velocity, throw['horizontal_component']) - throw['hang_time'] = time_to_known_distance(throw['vertical_component'], starting_y, acceleration=9.8) - throw['distance'] = throw['hang_time'] * throw['horizontal_component'] - def parabola(x): # 100% credit goes to wikipedia authors # https://en.wikipedia.org/wiki/Projectile_motion#Parabolic_equation @@ -56,15 +51,19 @@ def make_throw(starting_x, starting_y, starting_velocity, thrown_angle): y = left - (numerator / denominator) return y + throw = {'angle': thrown_angle} throw['parabola'] = parabola + throw['horizontal_component'] = starting_velocity * cos * -upward + throw['vertical_component'] = starting_velocity * sin * upward + throw['hang_time'] = time_to_known_distance(throw['vertical_component'], starting_y, acceleration=9.8) + throw['distance'] = throw['hang_time'] * throw['horizontal_component'] throw['parabola_points'] = [] - #print(throw['vertical_component'], throw['hang_time']) y = 1 x = starting_x backwards = (thrown_angle in range(90, 270)) or (thrown_angle in range(-90, -270, -1)) while y > 0: - y = throw['parabola'](x) + starting_y + y = parabola(x) + starting_y if y < 0: # To keep a smooth floor of 0, rescale the active x so that # it looks like it continues in the right direction underground. 
@@ -77,7 +76,7 @@ def make_throw(starting_x, starting_y, starting_velocity, thrown_angle): if (smallest_x is None or x < smallest_x): smallest_x = math.floor(x) if (largest_x is None or x > largest_x): largest_x = math.ceil(x) if (largest_y is None or y > largest_y): largest_y = math.ceil(y) - throw['parabola_points'].append([int(x), int(y)]) + throw['parabola_points'].append((int(x), int(y))) if backwards: x -= PLOT_STEP_X else: @@ -141,12 +140,12 @@ for (index, t) in enumerate(throws): point_a = None for pointindex in range(len(t['parabola_points']) - 1): if point_a is None: - point_a = t['parabola_points'][pointindex][:] + point_a = list(t['parabola_points'][pointindex]) point_a[0] = (round(point_a[0])) + abs(smallest_x) + PLOT_PAD_LEFT point_a[1] = (largest_y - round(point_a[1])) else: point_a = point_b - point_b = t['parabola_points'][pointindex + 1][:] + point_b = list(t['parabola_points'][pointindex + 1]) point_b[0] = (round(point_b[0])) + abs(smallest_x) + PLOT_PAD_LEFT point_b[1] = (largest_y - round(point_b[1])) try: diff --git a/ProjectileParabola/projectiles.png b/ProjectileParabola/projectiles.png index efd2900..c8ccb20 100644 --- a/ProjectileParabola/projectiles.png +++ b/ProjectileParabola/projectiles.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:889e3073049ed9d4b0dff0414ab769fbd9cac821177446ef9b5df08d34c8e2f5 -size 8042 +oid sha256:67e8ae6342a582acff7c6fa0904f24c5265a0bb666b132cb505dc56728bfedf9 +size 8054 diff --git a/SnudownTest/.gitignore b/SnudownTest/.gitignore new file mode 100644 index 0000000..4945b7a --- /dev/null +++ b/SnudownTest/.gitignore @@ -0,0 +1,10 @@ +build/ +dist/ +snudown.egg-info/ +src/html_entities.h +*.pyc +*.so +*.so.* +*.o +/fuzzing/bin +/fuzzing/testing diff --git a/SnudownTest/.gitmodules b/SnudownTest/.gitmodules new file mode 100644 index 0000000..e909bd5 --- /dev/null +++ b/SnudownTest/.gitmodules @@ -0,0 +1,4 @@ +[submodule "gumbo_snudown"] + path = fuzzing/gumbo_snudown + url = 
git@github.com:JordanMilne/gumbo-parser.git + branch = markdown_validation diff --git a/SnudownTest/Python.h b/SnudownTest/Python.h new file mode 100644 index 0000000..2dd8290 --- /dev/null +++ b/SnudownTest/Python.h @@ -0,0 +1,133 @@ +#ifndef Py_PYTHON_H +#define Py_PYTHON_H +/* Since this is a "meta-include" file, no #ifdef __cplusplus / extern "C" { */ + +/* Include nearly all Python header files */ + +#include "patchlevel.h" +#include "pyconfig.h" +#include "pymacconfig.h" + +#include + +#ifndef UCHAR_MAX +#error "Something's broken. UCHAR_MAX should be defined in limits.h." +#endif + +#if UCHAR_MAX != 255 +#error "Python's source code assumes C's unsigned char is an 8-bit type." +#endif + +#if defined(__sgi) && defined(WITH_THREAD) && !defined(_SGI_MP_SOURCE) +#define _SGI_MP_SOURCE +#endif + +#include +#ifndef NULL +# error "Python.h requires that stdio.h define NULL." +#endif + +#include +#ifdef HAVE_ERRNO_H +#include +#endif +#include +#ifdef HAVE_UNISTD_H +#include +#endif + +/* For size_t? */ +#ifdef HAVE_STDDEF_H +#include +#endif + +/* CAUTION: Build setups should ensure that NDEBUG is defined on the + * compiler command line when building Python in release mode; else + * assert() calls won't be removed. + */ +#include + +#include "pyport.h" +#include "pymacro.h" + +#include "pyatomic.h" + +/* Debug-mode build with pymalloc implies PYMALLOC_DEBUG. + * PYMALLOC_DEBUG is in error if pymalloc is not in use. 
+ */ +#if defined(Py_DEBUG) && defined(WITH_PYMALLOC) && !defined(PYMALLOC_DEBUG) +#define PYMALLOC_DEBUG +#endif +#if defined(PYMALLOC_DEBUG) && !defined(WITH_PYMALLOC) +#error "PYMALLOC_DEBUG requires WITH_PYMALLOC" +#endif +#include "pymath.h" +#include "pytime.h" +#include "pymem.h" + +#include "object.h" +#include "objimpl.h" +#include "typeslots.h" +#include "pyhash.h" + +#include "pydebug.h" + +#include "bytearrayobject.h" +#include "bytesobject.h" +#include "unicodeobject.h" +#include "longobject.h" +#include "longintrepr.h" +#include "boolobject.h" +#include "floatobject.h" +#include "complexobject.h" +#include "rangeobject.h" +#include "memoryobject.h" +#include "tupleobject.h" +#include "listobject.h" +#include "dictobject.h" +#include "enumobject.h" +#include "setobject.h" +#include "methodobject.h" +#include "moduleobject.h" +#include "funcobject.h" +#include "classobject.h" +#include "fileobject.h" +#include "pycapsule.h" +#include "traceback.h" +#include "sliceobject.h" +#include "cellobject.h" +#include "iterobject.h" +#include "genobject.h" +#include "descrobject.h" +#include "warnings.h" +#include "weakrefobject.h" +#include "structseq.h" +#include "namespaceobject.h" + +#include "codecs.h" +#include "pyerrors.h" + +#include "pystate.h" + +#include "pyarena.h" +#include "modsupport.h" +#include "pythonrun.h" +#include "ceval.h" +#include "sysmodule.h" +#include "intrcheck.h" +#include "import.h" + +#include "abstract.h" +#include "bltinmodule.h" + +#include "compile.h" +#include "eval.h" + +#include "pyctype.h" +#include "pystrtod.h" +#include "pystrcmp.h" +#include "dtoa.h" +#include "fileutils.h" +#include "pyfpe.h" + +#endif /* !Py_PYTHON_H */ diff --git a/SnudownTest/SECURITY.md b/SnudownTest/SECURITY.md new file mode 100644 index 0000000..4c40dbe --- /dev/null +++ b/SnudownTest/SECURITY.md @@ -0,0 +1,12 @@ +For safety reasons, whenever you add or change something in Snudown, +you should add a few test-cases that demonstrate your change and do 
a +fuzzing run in `/fuzzing` by running `make afl`. Make sure you have `cmake` +installed and in your `PATH`! + +This uses [American Fuzzy Lop](http://lcamtuf.coredump.cx/afl/) and a +modified [Google Gumbo](https://github.com/google/gumbo-parser/) to ensure +there is no way to generate invalid HTML, and that there are no unsafe +memory operations. + +See [American Fuzzy Lop](http://lcamtuf.coredump.cx/afl/)'s instructions +for your platform to get started. diff --git a/SnudownTest/autolink.c b/SnudownTest/autolink.c new file mode 100644 index 0000000..8d0e39a --- /dev/null +++ b/SnudownTest/autolink.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 2011, Vicent Marti + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include "buffer.h" +#include "autolink.h" + +#include +#include +#include +#include + +#if defined(_WIN32) +#define strncasecmp _strnicmp +#endif + +int +sd_autolink_issafe(const uint8_t *link, size_t link_len) +{ + static const size_t valid_uris_count = 14; + static const char *valid_uris[] = { + "http://", "https://", "ftp://", "mailto://", + "/", "git://", "steam://", "irc://", "news://", "mumble://", + "ssh://", "ircs://", "ts3server://", "#" + }; + + size_t i; + + for (i = 0; i < valid_uris_count; ++i) { + size_t len = strlen(valid_uris[i]); + + if (link_len > len && + strncasecmp((char *)link, valid_uris[i], len) == 0 && + (isalnum(link[len]) || link[len] == '#' || link[len] == '/' || link[len] == '?')) + return 1; + } + + return 0; +} + +static size_t +autolink_delim(uint8_t *data, size_t link_end, size_t max_rewind, size_t size) +{ + uint8_t cclose, copen = 0; + size_t i; + + for (i = 0; i < link_end; ++i) + if (data[i] == '<') { + link_end = i; + break; + } + + while (link_end > 0) { + uint8_t c = data[link_end - 1]; + + if (c == 0) + break; + + if (strchr("?!.,", c) != NULL) + link_end--; + + else if (c == ';') { + size_t new_end = link_end - 2; + + while (new_end > 0 && isalpha(data[new_end])) + new_end--; + + if (new_end < link_end - 2 && data[new_end] == '&') + link_end = new_end; + else + link_end--; + } + else break; + } + + if (link_end == 0) + return 0; + + cclose = data[link_end - 1]; + + switch (cclose) { + case '"': copen = '"'; break; + case '\'': copen = '\''; break; + case ')': copen = '('; break; + case ']': copen = '['; break; + case '}': copen = '{'; break; + } + + if (copen != 0) { + size_t closing = 0; + size_t opening = 0; + size_t i = 0; + + /* Try to close the final punctuation sign in this same line; + * if we managed to close it outside of the URL, that means that it's + * not part of the URL. If it closes inside the URL, that means it + * is part of the URL. 
+ * + * Examples: + * + * foo http://www.pokemon.com/Pikachu_(Electric) bar + * => http://www.pokemon.com/Pikachu_(Electric) + * + * foo (http://www.pokemon.com/Pikachu_(Electric)) bar + * => http://www.pokemon.com/Pikachu_(Electric) + * + * foo http://www.pokemon.com/Pikachu_(Electric)) bar + * => http://www.pokemon.com/Pikachu_(Electric)) + * + * (foo http://www.pokemon.com/Pikachu_(Electric)) bar + * => foo http://www.pokemon.com/Pikachu_(Electric) + */ + + while (i < link_end) { + if (data[i] == copen) + opening++; + else if (data[i] == cclose) + closing++; + + i++; + } + + if (closing != opening) + link_end--; + } + + return link_end; +} + +/* + * Checks that `prefix_char` occurs on a word boundary just before `data`, + * where `data` points to the character to search to the left of, and a word boundary + * is (currently) a whitespace character, punctuation, or the start of the string. + * Returns the length of the prefix. + */ +static int +check_reddit_autolink_prefix( + const uint8_t* data, + size_t max_rewind, + size_t max_lookbehind, + size_t size, + char prefix_char + ) +{ + /* Make sure this `/` is part of `/?r/` */ + if (size < 2 || max_rewind < 1 || data[-1] != prefix_char) + return 0; + + /* Not at the start of the buffer, no inlines to the immediate left of the `prefix_char` */ + if (max_rewind > 1) { + const char boundary = data[-2]; + if (boundary == '/') + return 2; + /** + * Here's where our lack of unicode-awareness bites us. We don't correctly + * match punctuation / whitespace characters for the boundary, because we + * reject valid cases like "。r/example" (note the fullwidth period.) + * + * A better implementation might try to rewind over bytes with the 8th bit set, try + * to decode them to a valid codepoint, then do a unicode-aware check on the codepoint. 
+ */ + else if (ispunct(boundary) || isspace(boundary)) + return 1; + else + return 0; + } else if (max_lookbehind > 2) { + /* There's an inline element just left of the `prefix_char`, is it an escaped forward + * slash? bail out so we correctly handle stuff like "\/r/foo". This will also correctly + * allow "\\/r/foo". + */ + if (data[-2] == '/' && data[-3] == '\\') + return 0; + } + + /* Must be a new-style shortlink with nothing relevant to the left of it. */ + return 1; +} + +static size_t +check_domain(uint8_t *data, size_t size, int allow_short) +{ + size_t i, np = 0; + + if (!isalnum(data[0])) + return 0; + + for (i = 1; i < size - 1; ++i) { + if (data[i] == '.') np++; + else if (!isalnum(data[i]) && data[i] != '-') break; + } + + if (allow_short) { + /* We don't need a valid domain in the strict sense (with + * least one dot; so just make sure it's composed of valid + * domain characters and return the length of the the valid + * sequence. */ + return i; + } else { + /* a valid domain needs to have at least a dot. + * that's as far as we get */ + return np ? 
i : 0; + } +} + +size_t +sd_autolink__www( + size_t *rewind_p, + struct buf *link, + uint8_t *data, + size_t max_rewind, + size_t size, + unsigned int flags) +{ + size_t link_end; + + if (max_rewind > 0 && !ispunct(data[-1]) && !isspace(data[-1])) + return 0; + + if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0) + return 0; + + link_end = check_domain(data, size, 0); + + if (link_end == 0) + return 0; + + while (link_end < size && !isspace(data[link_end])) + link_end++; + + link_end = autolink_delim(data, link_end, max_rewind, size); + + if (link_end == 0) + return 0; + + bufput(link, data, link_end); + *rewind_p = 0; + + return (int)link_end; +} + +size_t +sd_autolink__email( + size_t *rewind_p, + struct buf *link, + uint8_t *data, + size_t max_rewind, + size_t size, + unsigned int flags) +{ + size_t link_end, rewind; + int nb = 0, np = 0; + + for (rewind = 0; rewind < max_rewind; ++rewind) { + uint8_t c = data[-rewind - 1]; + + if (c == 0) + break; + + if (isalnum(c)) + continue; + + if (strchr(".+-_", c) != NULL) + continue; + + break; + } + + if (rewind == 0) + return 0; + + for (link_end = 0; link_end < size; ++link_end) { + uint8_t c = data[link_end]; + + if (isalnum(c)) + continue; + + if (c == '@') + nb++; + else if (c == '.' 
&& link_end < size - 1) + np++; + else if (c != '-' && c != '_') + break; + } + + if (link_end < 2 || nb != 1 || np == 0) + return 0; + + link_end = autolink_delim(data, link_end, max_rewind, size); + + if (link_end == 0) + return 0; + + bufput(link, data - rewind, link_end + rewind); + *rewind_p = rewind; + + return link_end; +} + +size_t +sd_autolink__url( + size_t *rewind_p, + struct buf *link, + uint8_t *data, + size_t max_rewind, + size_t size, + unsigned int flags) +{ + size_t link_end, rewind = 0, domain_len; + + if (size < 4 || data[1] != '/' || data[2] != '/') + return 0; + + while (rewind < max_rewind && isalpha(data[-rewind - 1])) + rewind++; + + if (!sd_autolink_issafe(data - rewind, size + rewind)) + return 0; + + link_end = strlen("://"); + + domain_len = check_domain( + data + link_end, + size - link_end, + flags & SD_AUTOLINK_SHORT_DOMAINS); + + if (domain_len == 0) + return 0; + + link_end += domain_len; + while (link_end < size && !isspace(data[link_end])) + link_end++; + + link_end = autolink_delim(data, link_end, max_rewind, size); + + if (link_end == 0) + return 0; + + bufput(link, data - rewind, link_end + rewind); + *rewind_p = rewind; + + return link_end; +} + +size_t +sd_autolink__subreddit( + size_t *rewind_p, + struct buf *link, + uint8_t *data, + size_t max_rewind, + size_t max_lookbehind, + size_t size, + int *no_slash + ) +{ + /** + * This is meant to handle both r/foo and /r/foo style subreddit references. + * In a valid /?r/ link, `*data` will always point to the '/' after the first 'r'. 
+ * In pseudo-regex, this matches something like: + * + * `(/|(?<=\b))r/(all-)?%subreddit%([-+]%subreddit%)*(/[\w\-/]*)?` + * where %subreddit% == `((t:)?\w{2,24}|reddit\.com)` + */ + size_t link_end; + size_t rewind; + int is_allminus = 0; + + rewind = check_reddit_autolink_prefix(data, max_rewind, max_lookbehind, size, 'r'); + if (!rewind) + return 0; + + /* offset to the "meat" of the link */ + link_end = strlen("/"); + + if (size >= link_end + 4 && strncasecmp((char*)data + link_end, "all-", 4) == 0) + is_allminus = 1; + + do { + size_t start = link_end; + int max_length = 24; + + /* special case: /r/reddit.com (only subreddit containing '.'). */ + if ( size >= link_end+10 && strncasecmp((char*)data+link_end, "reddit.com", 10) == 0 ) { + link_end += 10; + /* Make sure there are no trailing characters (don't do + * any autolinking for /r/reddit.commission) */ + max_length = 10; + } + + /* If not a special case, verify it begins with (t:)?[A-Za-z0-9] */ + else { + /* support autolinking to timereddits, /r/t:when (1 April 2012) */ + if ( size > link_end+2 && strncasecmp((char*)data+link_end, "t:", 2) == 0 ) + link_end += 2; /* Jump over the 't:' */ + + /* the first character of a subreddit name must be a letter or digit */ + if (!isalnum(data[link_end])) + return 0; + link_end += 1; + } + + /* consume valid characters ([A-Za-z0-9_]) until we run out */ + while (link_end < size && (isalnum(data[link_end]) || + data[link_end] == '_')) + link_end++; + + /* valid subreddit names are between 3 and 21 characters, with + * some subreddits having 2-character names. Don't bother with + * autolinking for anything outside this length range. 
+ * (chksrname function in reddit/.../validator.py) */ + if ( link_end-start < 2 || link_end-start > max_length ) + return 0; + + /* If we are linking to a multireddit, continue */ + } while ( link_end < size && (data[link_end] == '+' || (is_allminus && data[link_end] == '-')) && link_end++ ); + + if (link_end < size && data[link_end] == '/') { + while (link_end < size && (isalnum(data[link_end]) || + data[link_end] == '_' || + data[link_end] == '/' || + data[link_end] == '-')) + link_end++; + } + + /* make the link */ + bufput(link, data - rewind, link_end + rewind); + + *no_slash = (rewind == 1); + *rewind_p = rewind; + + return link_end; +} + +size_t +sd_autolink__username( + size_t *rewind_p, + struct buf *link, + uint8_t *data, + size_t max_rewind, + size_t max_lookbehind, + size_t size, + int *no_slash + ) +{ + size_t link_end; + size_t rewind; + + if (size < 3) + return 0; + + rewind = check_reddit_autolink_prefix(data, max_rewind, max_lookbehind, size, 'u'); + if (!rewind) + return 0; + + link_end = strlen("/"); + + /* the first letter of a username must... 
well, be valid, we don't care otherwise */ + if (!isalnum(data[link_end]) && data[link_end] != '_' && data[link_end] != '-') + return 0; + link_end += 1; + + /* consume valid characters ([A-Za-z0-9_-/]) until we run out */ + while (link_end < size && (isalnum(data[link_end]) || + data[link_end] == '_' || + data[link_end] == '/' || + data[link_end] == '-')) + link_end++; + + /* make the link */ + bufput(link, data - rewind, link_end + rewind); + + *no_slash = (rewind == 1); + *rewind_p = rewind; + + return link_end; +} diff --git a/SnudownTest/autolink.h b/SnudownTest/autolink.h new file mode 100644 index 0000000..55b7aaa --- /dev/null +++ b/SnudownTest/autolink.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2011, Vicent Marti + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#ifndef UPSKIRT_AUTOLINK_H +#define UPSKIRT_AUTOLINK_H + +#include "buffer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + SD_AUTOLINK_SHORT_DOMAINS = (1 << 0), +}; + +int +sd_autolink_issafe(const uint8_t *link, size_t link_len); + +size_t +sd_autolink__www(size_t *rewind_p, struct buf *link, + uint8_t *data, size_t max_rewind, size_t size, unsigned int flags); + +size_t +sd_autolink__email(size_t *rewind_p, struct buf *link, + uint8_t *data, size_t max_rewind, size_t size, unsigned int flags); + +size_t +sd_autolink__url(size_t *rewind_p, struct buf *link, + uint8_t *data, size_t max_rewind, size_t size, unsigned int flags); + +extern size_t +sd_autolink__subreddit(size_t *rewind_p, struct buf *link, uint8_t *data, + size_t max_rewind, size_t max_lookbehind, size_t size, int *no_slash); + +extern size_t +sd_autolink__username(size_t *rewind_p, struct buf *link, uint8_t *data, + size_t max_rewind, size_t max_lookbehind, size_t size, int *no_slash); + +#ifdef __cplusplus +} +#endif + +#endif + +/* vim: set filetype=c: */ diff --git a/SnudownTest/buffer.c b/SnudownTest/buffer.c new file mode 100644 index 0000000..ab18948 --- /dev/null +++ b/SnudownTest/buffer.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2008, Natacha Porté + * Copyright (c) 2011, Vicent Martí + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#define BUFFER_MAX_ALLOC_SIZE (1024 * 1024 * 16) //16mb + +#include "buffer.h" + +#include +#include +#include +#include + +/* MSVC compat */ +#if defined(_MSC_VER) +# define _buf_vsnprintf _vsnprintf +#else +# define _buf_vsnprintf vsnprintf +#endif + +int +bufprefix(const struct buf *buf, const char *prefix) +{ + size_t i; + assert(buf && buf->unit); + + for (i = 0; i < buf->size; ++i) { + if (prefix[i] == 0) + return 0; + + if (buf->data[i] != prefix[i]) + return buf->data[i] - prefix[i]; + } + + return 0; +} + +/* bufgrow: increasing the allocated size to the given value */ +int +bufgrow(struct buf *buf, size_t neosz) +{ + size_t neoasz; + void *neodata; + + assert(buf && buf->unit); + + if (neosz > BUFFER_MAX_ALLOC_SIZE) + return BUF_ENOMEM; + + if (buf->asize >= neosz) + return BUF_OK; + + neoasz = buf->asize + buf->unit; + while (neoasz < neosz) + neoasz += buf->unit; + + neodata = realloc(buf->data, neoasz); + if (!neodata) + return BUF_ENOMEM; + + buf->data = neodata; + buf->asize = neoasz; + return BUF_OK; +} + + +/* bufnew: allocation of a new buffer */ +struct buf * +bufnew(size_t unit) +{ + struct buf *ret; + ret = malloc(sizeof (struct buf)); + + if (ret) { + ret->data = 0; + ret->size = ret->asize = 0; + ret->unit = unit; + } + return ret; +} + +/* bufnullterm: NULL-termination of the string array */ +const char * +bufcstr(struct buf *buf) +{ + assert(buf && buf->unit); + + if (buf->size < buf->asize && buf->data[buf->size] == 0) + return (char *)buf->data; + + if (buf->size + 1 <= buf->asize || bufgrow(buf, buf->size + 1) == 0) { + buf->data[buf->size] = 0; + return (char *)buf->data; + } + + return NULL; +} + 
+/* bufprintf: formatted printing to a buffer */ +void +bufprintf(struct buf *buf, const char *fmt, ...) +{ + va_list ap; + int n; + + assert(buf && buf->unit); + + if (buf->size >= buf->asize && bufgrow(buf, buf->size + 1) < 0) + return; + va_start(ap, fmt); + n = _buf_vsnprintf((char *)buf->data + buf->size, buf->asize - buf->size, fmt, ap); + va_end(ap); + + if (n < 0) { +#ifdef _MSC_VER + va_start(ap, fmt); + n = _vscprintf(fmt, ap); + va_end(ap); +#else + return; +#endif + } + if ((size_t)n >= buf->asize - buf->size) { + if (bufgrow(buf, buf->size + n + 1) < 0) + return; + + va_start(ap, fmt); + n = _buf_vsnprintf((char *)buf->data + buf->size, buf->asize - buf->size, fmt, ap); + va_end(ap); + } + + if (n < 0) + return; + + buf->size += n; +} + +/* bufput: appends raw data to a buffer */ +void +bufput(struct buf *buf, const void *data, size_t len) +{ + assert(buf && buf->unit); + + if (buf->size + len > buf->asize && bufgrow(buf, buf->size + len) < 0) + return; + + memcpy(buf->data + buf->size, data, len); + buf->size += len; +} + +/* bufputs: appends a NUL-terminated string to a buffer */ +void +bufputs(struct buf *buf, const char *str) +{ + bufput(buf, str, strlen(str)); +} + + +/* bufputc: appends a single uint8_t to a buffer */ +void +bufputc(struct buf *buf, int c) +{ + assert(buf && buf->unit); + + if (buf->size + 1 > buf->asize && bufgrow(buf, buf->size + 1) < 0) + return; + + buf->data[buf->size] = c; + buf->size += 1; +} + +/* bufrelease: decrease the reference count and free the buffer if needed */ +void +bufrelease(struct buf *buf) +{ + if (!buf) + return; + + free(buf->data); + free(buf); +} + + +/* bufreset: frees internal data of the buffer */ +void +bufreset(struct buf *buf) +{ + if (!buf) + return; + + free(buf->data); + buf->data = NULL; + buf->size = buf->asize = 0; +} + +/* bufslurp: removes a given number of bytes from the head of the array */ +void +bufslurp(struct buf *buf, size_t len) +{ + assert(buf && buf->unit); + + if (len >= 
buf->size) { + buf->size = 0; + return; + } + + buf->size -= len; + memmove(buf->data, buf->data + len, buf->size); +} + +/* buftrucate: truncates the buffer at `size` */ +int +buftruncate(struct buf *buf, size_t size) +{ + if (buf->size < size || size < 0) { + /* bail out in debug mode so we can figure out why this happened */ + assert(0); + return BUF_EINVALIDIDX; + } + + buf->size = size; + return BUF_OK; +} diff --git a/SnudownTest/buffer.h b/SnudownTest/buffer.h new file mode 100644 index 0000000..ab98ab6 --- /dev/null +++ b/SnudownTest/buffer.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2008, Natacha Porté + * Copyright (c) 2011, Vicent Martí + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#ifndef BUFFER_H__ +#define BUFFER_H__ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_MSC_VER) +#define __attribute__(x) +#define inline +#endif + +typedef enum { + BUF_OK = 0, + BUF_ENOMEM = -1, + BUF_EINVALIDIDX = -2, +} buferror_t; + +/* struct buf: character array buffer */ +struct buf { + uint8_t *data; /* actual character data */ + size_t size; /* size of the string */ + size_t asize; /* allocated size (0 = volatile buffer) */ + size_t unit; /* reallocation unit size (0 = read-only buffer) */ +}; + +/* CONST_BUF: global buffer from a string litteral */ +#define BUF_STATIC(string) \ + { (uint8_t *)string, sizeof string -1, sizeof string, 0, 0 } + +/* VOLATILE_BUF: macro for creating a volatile buffer on the stack */ +#define BUF_VOLATILE(strname) \ + { (uint8_t *)strname, strlen(strname), 0, 0, 0 } + +/* BUFPUTSL: optimized bufputs of a string litteral */ +#define BUFPUTSL(output, literal) \ + bufput(output, literal, sizeof literal - 1) + +/* bufgrow: increasing the allocated size to the given value */ +int bufgrow(struct buf *, size_t); + +/* bufnew: allocation of a new buffer */ +struct buf *bufnew(size_t) __attribute__ ((malloc)); + +/* bufnullterm: NUL-termination of the string array (making a C-string) */ +const char *bufcstr(struct buf *); + +/* bufprefix: compare the beginning of a buffer with a string */ +int bufprefix(const struct buf *buf, const char *prefix); + +/* bufput: appends raw data to a buffer */ +void bufput(struct buf *, const void *, size_t); + +/* bufputs: appends a NUL-terminated string to a buffer */ +void bufputs(struct buf *, const char *); + +/* bufputc: appends a single char to a buffer */ +void bufputc(struct buf *, int); + +/* bufrelease: decrease the reference count and free the buffer if needed */ +void bufrelease(struct buf *); + +/* bufreset: frees internal data of the buffer */ +void bufreset(struct buf *); + +/* bufslurp: removes a given number of bytes from the head of 
the array */ +void bufslurp(struct buf *, size_t); + +/* bufprintf: formatted printing to a buffer */ +void bufprintf(struct buf *, const char *, ...) __attribute__ ((format (printf, 2, 3))); + +/* buftruncate: truncates the buffer at `size` */ +int buftruncate(struct buf *buf, size_t size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/SnudownTest/debian/changelog b/SnudownTest/debian/changelog new file mode 100644 index 0000000..e8dce65 --- /dev/null +++ b/SnudownTest/debian/changelog @@ -0,0 +1,145 @@ +snudown (1.4.0) unstable; urgency=medium + + * autolink r/subreddit and u/user + * security: don't rewind over previous inlines when autolinking + * email autolinks re-enabled due to ^ + * more stringent character entity checks and sanitization + * properly handle URLs containing control characters + + -- Jordan Milne Mon, 01 Jun 2015 13:04:23 -0700 + +snudown (1.3.2) unstable; urgency=medium + + * fix alphanumeric-named entities + + -- Neil Williams Wed, 25 Feb 2015 13:32:41 -0800 + +snudown (1.3.1) unstable; urgency=medium + + * add missing entities to entity whitelist + + -- Neil Williams Tue, 24 Feb 2015 22:12:29 -0800 + +snudown (1.3.0) unstable; urgency=medium + + * validate html entities and escape unrecognized ones + + -- Neil Williams Tue, 24 Feb 2015 17:55:38 -0800 + +snudown (1.2.0) unstable; urgency=medium + + * security: fix rewind issues + * email autolinks disabled due to ^ + * security: fix table header OOM bomb + + -- Neil Williams Sat, 20 Sep 2014 11:59:34 -0700 + +snudown (1.1.6) unstable; urgency=low + + * add ts3server url scheme to whitelist + * redo html sanitization for wiki renderer + + -- Neil Williams Tue, 01 Apr 2014 17:12:50 -0700 + +snudown (1.1.5) unstable; urgency=low + + * bring path stuff into user/subreddit autolinking (multis, subpages etc.) 
+ * make /u/ autolinking case sensitive + + -- Neil Williams Wed, 22 May 2013 16:09:31 -0700 + +snudown (1.1.4) unstable; urgency=low + + * make /r/ autolinking case sensitive + + -- Neil Williams Mon, 25 Feb 2013 23:27:10 -0800 + +snudown (1.1.3) unstable; urgency=low + + * add support for /r/all-minus + + -- Neil Williams Tue, 08 Jan 2013 12:55:40 -0800 + +snudown (1.1.2) unstable; urgency=low + + * don't close the toc div if there wasn't a toc :( + + -- Neil Williams Wed, 12 Dec 2012 17:38:05 -0800 + +snudown (1.1.1) unstable; urgency=low + + * minor code cleanup + * add a div around wiki table of contents for styling purposes + + -- Neil Williams Wed, 12 Dec 2012 13:47:49 -0800 + +snudown (1.1.0) unstable; urgency=low + + * add wiki variant of markdown syntax (allows links, and + some raw html) + + -- Neil Williams Wed, 05 Sep 2012 23:30:34 -0700 + +snudown (1.0.7) unstable; urgency=low + + * add python-setuptools to build-depends + + -- Neil Williams Thu, 09 Aug 2012 14:46:49 -0700 + +snudown (1.0.6) unstable; urgency=low + + * made subreddit autolinking more robust thanks to nandhp + * cleaned up packaging + * merged upstream fixes: + * fix blockquotes nested inside paragraphs + * improve parsing of continuous list items + * fix infinite loop parsing strikethrouhgs + + -- Neil Williams Thu, 09 Aug 2012 13:06:38 -0700 + +snudown (1.0.5) unstable; urgency=low + + * require a space between url and title + * merged upstream fixes: + * whitespace after tables prevent them from rendering + * escape html in contents of tables + + -- Neil Williams Thu, 23 Feb 2012 08:40:39 -0800 + +snudown (1.0.4) unstable; urgency=low + + * change username autolinking to /u/username + * properly handle backslash at end of message + + -- Neil Williams Thu, 26 Jan 2012 18:26:45 -0800 + +snudown (1.0.3) unstable; urgency=low + + * ~username auto-linking + * make table headers less strict + * correctly handle ) in link title text + * synced with upstream + * code clean-up + * utf-8 
fixes + + -- Neil Williams Wed, 18 Jan 2012 15:20:35 -0800 + +snudown (1.0.2) unstable; urgency=low + + * synced up with upstream + * more safelink relaxation based on community requests + * fixed nesting unordered lists within ordered lists and vice versa + + -- Neil Williams Sat, 19 Nov 2011 17:16:47 -0800 + +snudown (1.0.1) unstable; urgency=low + + * new version, new package + + -- Neil Williams Thu, 17 Nov 2011 14:22:26 -0800 + +snudown (1.0.0) unstable; urgency=low + + * source package automatically created by stdeb 0.6.0+git + + -- Neil Williams Wed, 16 Nov 2011 10:36:53 -0800 diff --git a/SnudownTest/debian/compat b/SnudownTest/debian/compat new file mode 100644 index 0000000..7f8f011 --- /dev/null +++ b/SnudownTest/debian/compat @@ -0,0 +1 @@ +7 diff --git a/SnudownTest/debian/control b/SnudownTest/debian/control new file mode 100644 index 0000000..19a8bd8 --- /dev/null +++ b/SnudownTest/debian/control @@ -0,0 +1,15 @@ +Source: snudown +Maintainer: Neil Williams +Section: python +Priority: optional +Build-Depends: python-all-dev (>= 2.6.6-3), debhelper (>= 7), python-setuptools, gperf +Standards-Version: 3.9.3 +Homepage: https://github.com/reddit/snudown +Vcs-Git: git://github.com/reddit/snudown.git + +Package: python-snudown +Architecture: any +Depends: ${misc:Depends}, ${python:Depends}, ${shlibs:Depends} +Breaks: ${python:Breaks} +Description: reddit's python wrapper and customization of the Sundown Markdown interpreter. + diff --git a/SnudownTest/debian/copyright b/SnudownTest/debian/copyright new file mode 100644 index 0000000..3d301c3 --- /dev/null +++ b/SnudownTest/debian/copyright @@ -0,0 +1,30 @@ +Format: http://dep.debian.net/deps/dep5 +Upstream-Name: snudown +Source: https://github.com/reddit/snudown + +Files: * +Copyright: 2011-2012 Vicent Marti + 2011-2012 reddit Inc. +License: MIT + +Files: debian/* +Copyright: 2011-2012 reddit Inc. +License: MIT + +Files: test_snudown.py +Copyright: 2011-2012 reddit Inc. 
+License: MIT + +License: MIT + Permission to use, copy, modify, and distribute this software for any purpose + with or without fee is hereby granted, provided that the above copyright + notice and this permission notice appear in all copies. + . + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + PERFORMANCE OF THIS SOFTWARE. + diff --git a/SnudownTest/debian/rules b/SnudownTest/debian/rules new file mode 100644 index 0000000..945a5fe --- /dev/null +++ b/SnudownTest/debian/rules @@ -0,0 +1,9 @@ +#!/usr/bin/make -f + +# This file was automatically generated by stdeb 0.6.0+git at +# Wed, 16 Nov 2011 10:36:53 -0800 + +%: + dh $@ --with python2 --buildsystem=python_distutils + + diff --git a/SnudownTest/debian/source/format b/SnudownTest/debian/source/format new file mode 100644 index 0000000..89ae9db --- /dev/null +++ b/SnudownTest/debian/source/format @@ -0,0 +1 @@ +3.0 (native) diff --git a/SnudownTest/fuzzing/CMakeLists.txt b/SnudownTest/fuzzing/CMakeLists.txt new file mode 100644 index 0000000..5ed60de --- /dev/null +++ b/SnudownTest/fuzzing/CMakeLists.txt @@ -0,0 +1,37 @@ +cmake_minimum_required(VERSION 2.8) + +set(HEADERS + ../html/houdini.h + ../html/html.h + ../src/autolink.h + ../src/buffer.h + ../src/html_blocks.h + ../src/html_entities.h + ../src/markdown.h + ../src/stack.h + ) +set(LIBRARY_SOURCES + ../html/houdini_href_e.c + ../html/houdini_html_e.c + ../html/html.c + ../html/html_smartypants.c + ../src/autolink.c + ../src/buffer.c + ../src/markdown.c + ../src/stack.c + ${HEADERS} + ) + +set(PROGRAM "snudown-validator") +set(PROGRAM_SOURCES + 
${LIBRARY_SOURCES} + snudown-validator.c + ) + +include_directories(. ../src ../html ./build/gumbo_snudown/include ${CMAKE_CURRENT_BINARY_DIR}) +link_directories(${CMAKE_CURRENT_SOURCE_DIR}/build/gumbo_snudown/lib) + +add_executable(${PROGRAM} ${PROGRAM_SOURCES}) +target_link_libraries(${PROGRAM} gumbo) + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -g -Wno-error=parentheses") diff --git a/SnudownTest/fuzzing/Makefile b/SnudownTest/fuzzing/Makefile new file mode 100644 index 0000000..5094b13 --- /dev/null +++ b/SnudownTest/fuzzing/Makefile @@ -0,0 +1,62 @@ +# Copyright (c) 2015, reddit inc. +# +# Permission to use, copy, modify, and distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +all: gumbo_snudown snudown-validator + +.PHONY: all clean gumbo_snudown snudown-validator build_dir + +build_dir: + mkdir -p build + +# Our modified gumbo for finding security-relevant syntax issues +gumbo_snudown: build_dir + mkdir -p build/gumbo_snudown + git submodule update --recursive + @[ -f "${CURDIR}/gumbo_snudown/configure" ] || { \ + cd gumbo_snudown; \ + ./autogen.sh; \ + ./configure --prefix=$(CURDIR)/build/gumbo_snudown; \ + } + # Don't build this with AFL instrumentation, I'm assuming Google + # already ran their own fuzzer over their own parser... 
+ $(MAKE) -C gumbo_snudown all install + +gperf_src: + cd ../src/ && gperf html_entities.gperf --output-file=html_entities.h + +# executable +snudown-validator: build_dir gumbo_snudown gperf_src + cd build && cmake .. -DCMAKE_C_COMPILER=$(AFL_PATH)/afl-gcc + $(MAKE) -C build all + +# stuff for fuzzing +gen_testcases: + mkdir -p testing/testcases + rm -f testing/testcases/test_default_*.md + python2.7 gen_testcases.py + +afl: gen_testcases snudown-validator + @[ -n "$(AFL_PATH)" ] || { echo '$$AFL_PATH not set'; false; } + @mkdir -p testing/afl_results + $(AFL_PATH)/afl-fuzz \ + -i testing/testcases \ + -o testing/afl_results \ + -t 100 \ + -m none \ + ./build/snudown-validator + +# housekeeping +clean: + rm -rf *.o + rm -rf build/ diff --git a/SnudownTest/fuzzing/gen_testcases.py b/SnudownTest/fuzzing/gen_testcases.py new file mode 100644 index 0000000..f952192 --- /dev/null +++ b/SnudownTest/fuzzing/gen_testcases.py @@ -0,0 +1,20 @@ +#!/bin/env python + +# dump all of our testcases into a directory as separate files, like AFL +# wants. 
+ +import os.path +import sys +import itertools + +sys.path.append("..") +import test_snudown + +cases = itertools.chain(test_snudown.cases.keys(), test_snudown.wiki_cases.keys()) +for i, md in enumerate(cases): + # skip huge testcases + if len(md) > 2048: + continue + test_path = os.path.join('testing', 'testcases', 'test_default_%d.md' % i) + with open(test_path, 'w') as f: + f.write(md) diff --git a/SnudownTest/fuzzing/snudown-validator.c b/SnudownTest/fuzzing/snudown-validator.c new file mode 100644 index 0000000..153e1c4 --- /dev/null +++ b/SnudownTest/fuzzing/snudown-validator.c @@ -0,0 +1,226 @@ +#include "markdown.h" +#include "html.h" +#include "buffer.h" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define READ_UNIT 1024 +#define OUTPUT_UNIT 64 + +#include "autolink.h" + +#define SNUDOWN_VERSION "1.3.2" + +enum snudown_renderer_mode { + RENDERER_USERTEXT = 0, + RENDERER_WIKI, + RENDERER_COUNT +}; + +struct snudown_renderopt { + struct html_renderopt html; + int nofollow; + const char *target; +}; + +struct snudown_renderer { + struct sd_markdown* main_renderer; + struct sd_markdown* toc_renderer; + struct module_state* state; + struct module_state* toc_state; +}; + +struct module_state { + struct sd_callbacks callbacks; + struct snudown_renderopt options; +}; + +static struct snudown_renderer sundown[RENDERER_COUNT]; + +static char* html_element_whitelist[] = {"tr", "th", "td", "table", "tbody", "thead", "tfoot", "caption", NULL}; +static char* html_attr_whitelist[] = {"colspan", "rowspan", "cellspacing", "cellpadding", "scope", NULL}; + +static struct module_state usertext_toc_state; +static struct module_state wiki_toc_state; +static struct module_state usertext_state; +static struct module_state wiki_state; + +static const unsigned int snudown_default_md_flags = + MKDEXT_NO_INTRA_EMPHASIS | + MKDEXT_SUPERSCRIPT | + MKDEXT_AUTOLINK | + MKDEXT_STRIKETHROUGH | + MKDEXT_TABLES; + +static const unsigned int 
snudown_default_render_flags = + HTML_SKIP_HTML | + HTML_SKIP_IMAGES | + HTML_SAFELINK | + HTML_ESCAPE | + HTML_USE_XHTML; + +static const unsigned int snudown_wiki_render_flags = + HTML_SKIP_HTML | + HTML_SAFELINK | + HTML_ALLOW_ELEMENT_WHITELIST | + HTML_ESCAPE | + HTML_USE_XHTML; + +static void +snudown_link_attr(struct buf *ob, const struct buf *link, void *opaque) +{ + struct snudown_renderopt *options = opaque; + + if (options->nofollow) + BUFPUTSL(ob, " rel=\"nofollow\""); + + if (options->target != NULL) { + BUFPUTSL(ob, " target=\""); + bufputs(ob, options->target); + bufputc(ob, '\"'); + } +} + +static struct sd_markdown* make_custom_renderer(struct module_state* state, + const unsigned int renderflags, + const unsigned int markdownflags, + int toc_renderer) { + if(toc_renderer) { + sdhtml_toc_renderer(&state->callbacks, + (struct html_renderopt *)&state->options); + } else { + sdhtml_renderer(&state->callbacks, + (struct html_renderopt *)&state->options, + renderflags); + } + + state->options.html.link_attributes = &snudown_link_attr; + state->options.html.html_element_whitelist = html_element_whitelist; + state->options.html.html_attr_whitelist = html_attr_whitelist; + + return sd_markdown_new( + markdownflags, + 16, + 64, + &state->callbacks, + &state->options + ); +} + +void init_default_renderer() { + sundown[RENDERER_USERTEXT].main_renderer = make_custom_renderer(&usertext_state, snudown_default_render_flags, snudown_default_md_flags, 0); + sundown[RENDERER_USERTEXT].toc_renderer = make_custom_renderer(&usertext_toc_state, snudown_default_render_flags, snudown_default_md_flags, 1); + sundown[RENDERER_USERTEXT].state = &usertext_state; + sundown[RENDERER_USERTEXT].toc_state = &usertext_toc_state; +} + +void init_wiki_renderer() { + sundown[RENDERER_WIKI].main_renderer = make_custom_renderer(&wiki_state, snudown_wiki_render_flags, snudown_default_md_flags, 0); + sundown[RENDERER_WIKI].toc_renderer = make_custom_renderer(&wiki_toc_state, 
snudown_wiki_render_flags, snudown_default_md_flags, 1); + sundown[RENDERER_WIKI].state = &wiki_state; + sundown[RENDERER_WIKI].toc_state = &wiki_toc_state; +} + +void +snudown_md(struct buf *ob, const uint8_t *document, size_t doc_size, int wiki_mode) +{ + int renderer = RENDERER_USERTEXT; + int enable_toc = 0; + struct snudown_renderer _snudown; + int nofollow = 0; + char* target = NULL; + char* toc_id_prefix = NULL; + unsigned int flags; + + if (wiki_mode) + renderer = RENDERER_WIKI; + + _snudown = sundown[renderer]; + + struct snudown_renderopt *options = &(_snudown.state->options); + options->nofollow = nofollow; + options->target = target; + + flags = options->html.flags; + + if (enable_toc) { + _snudown.toc_state->options.html.toc_id_prefix = toc_id_prefix; + sd_markdown_render(ob, document, doc_size, _snudown.toc_renderer); + _snudown.toc_state->options.html.toc_id_prefix = NULL; + + options->html.flags |= HTML_TOC; + } + + options->html.toc_id_prefix = toc_id_prefix; + + /* do the magic */ + sd_markdown_render(ob, document, doc_size, _snudown.main_renderer); + + options->html.toc_id_prefix = NULL; + options->html.flags = flags; +} +int +main(int argc, char **argv) +{ + init_default_renderer(); + init_wiki_renderer(); + + struct buf *ib, *ob; + int size_read = 0, wiki_mode = 0, i = 0, have_errors = 0; + + /* reading everything */ + ib = bufnew(READ_UNIT); + bufgrow(ib, READ_UNIT); + while ((size_read = fread(ib->data + ib->size, 1, ib->asize - ib->size, stdin)) > 0) { + ib->size += size_read; + bufgrow(ib, ib->size + READ_UNIT); + } + /* Render to a buffer, then print that out */ + ob = bufnew(OUTPUT_UNIT); + bufputs(ob, "\n"); + snudown_md(ob, ib->data, ib->size, wiki_mode); + bufputs(ob, "\n"); + + // Wiki mode explicitly allows unbalanced tags, need some way to exclude those + if (!wiki_mode) { + GumboOutput* output = gumbo_parse_with_options(&kGumboDefaultOptions, bufcstr(ob), ob->size); + + for (i=0; i < output->errors.length; ++i) { + // stupid 
"public" API I hacked in. + void* thing = output->errors.data[i]; + GumboErrorType type = gumbo_get_error_type(thing); + switch(type) { + case GUMBO_ERR_UTF8_INVALID: + case GUMBO_ERR_UTF8_NULL: + // Making sure the user gave us valid + // utf-8 or transforming it to valid + // utf-8 is outside the scope of snudown + continue; + default: + have_errors = 1; + printf("%s\n", GUMBO_ERROR_NAMES[type]); + printf("%s\n",gumbo_get_error_text(thing)); + printf("===============\n"); + break; + } + } + + if (have_errors) { + // gotta trigger a crash for AFL to catch it + assert(0); + } + + gumbo_destroy_output(&kGumboDefaultOptions, output); + } + bufrelease(ob); + bufrelease(ib); + return 0; +} diff --git a/SnudownTest/fuzzing/triageerrors.sh b/SnudownTest/fuzzing/triageerrors.sh new file mode 100644 index 0000000..0714aba --- /dev/null +++ b/SnudownTest/fuzzing/triageerrors.sh @@ -0,0 +1,2 @@ +#!/bin/bash +find testing/afl_results/ -regextype posix-egrep -regex ".*/(crashes|hangs)/.*" | xargs -I '{}' ./validatemd.sh {} diff --git a/SnudownTest/fuzzing/validatemd.sh b/SnudownTest/fuzzing/validatemd.sh new file mode 100644 index 0000000..3df6c26 --- /dev/null +++ b/SnudownTest/fuzzing/validatemd.sh @@ -0,0 +1,3 @@ +#!/bin/bash +echo "** ${1}" +./build/snudown-validator < $1 diff --git a/SnudownTest/gperf.exe b/SnudownTest/gperf.exe new file mode 100644 index 0000000..ff10d02 --- /dev/null +++ b/SnudownTest/gperf.exe @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f9266ea2d2bd19a503b5d2ec613e983c6ed9ea45ff6b5820b0681fd1b778d12 +size 103424 diff --git a/SnudownTest/houdini.h b/SnudownTest/houdini.h new file mode 100644 index 0000000..b4954c0 --- /dev/null +++ b/SnudownTest/houdini.h @@ -0,0 +1,37 @@ +#ifndef HOUDINI_H__ +#define HOUDINI_H__ + +#include "buffer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef HOUDINI_USE_LOCALE +# define _isxdigit(c) isxdigit(c) +# define _isdigit(c) isdigit(c) +#else +/* + * Helper _isdigit methods -- do not 
trust the current locale + * */ +# define _isxdigit(c) (strchr("0123456789ABCDEFabcdef", (c)) != NULL) +# define _isdigit(c) ((c) >= '0' && (c) <= '9') +#endif + +extern void houdini_escape_html(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_escape_html0(struct buf *ob, const uint8_t *src, size_t size, int secure); +extern void houdini_unescape_html(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_escape_xml(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_escape_uri(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_escape_url(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_escape_href(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_unescape_uri(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_unescape_url(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_escape_js(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_unescape_js(struct buf *ob, const uint8_t *src, size_t size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/SnudownTest/houdini_href_e.c b/SnudownTest/houdini_href_e.c new file mode 100644 index 0000000..581df1f --- /dev/null +++ b/SnudownTest/houdini_href_e.c @@ -0,0 +1,116 @@ +#include +#include +#include + +#include "houdini.h" + +#define ESCAPE_GROW_FACTOR(x) (((x) * 12) / 10) + +/* + * The following characters will not be escaped: + * + * -_.+!*'(),%#@?=;:/,+&$ alphanum + * + * Note that this character set is the addition of: + * + * - The characters which are safe to be in an URL + * - The characters which are *not* safe to be in + * an URL because they are RESERVED characters. + * + * We asume (lazily) that any RESERVED char that + * appears inside an URL is actually meant to + * have its native function (i.e. as an URL + * component/separator) and hence needs no escaping. 
+ * + * There are two exceptions: the chacters & (amp) + * and ' (single quote) do not appear in the table. + * They are meant to appear in the URL as components, + * yet they require special HTML-entity escaping + * to generate valid HTML markup. + * + * All other characters will be escaped to %XX. + * + */ +static const char HREF_SAFE[] = { + 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +void +houdini_escape_href(struct buf *ob, const uint8_t *src, size_t size) +{ + static const char hex_chars[] = "0123456789ABCDEF"; + size_t i = 0, org; + char hex_str[3]; + + bufgrow(ob, ESCAPE_GROW_FACTOR(size)); + hex_str[0] = '%'; + + while (i < size) { + org = i; + /* Skip by characters that don't need special + * processing */ + while (i < size && HREF_SAFE[src[i]] == 1) + i++; + + if (i > org) + bufput(ob, src + org, i - org); + + /* escaping */ + if (i >= size) + break; + + /* throw out control characters */ + if (HREF_SAFE[src[i]] == 2) { + i++; + continue; + } + + switch (src[i]) { + /* amp appears all the time in URLs, but needs + * HTML-entity escaping to be inside an href */ + case '&': + BUFPUTSL(ob, "&"); + break; + + /* the single quote is a valid URL character + * according to the standard; it needs HTML + * entity 
escaping too */ + case '\'': + BUFPUTSL(ob, "'"); + break; + + /* the space can be escaped to %20 or a plus + * sign. we're going with the generic escape + * for now. the plus thing is more commonly seen + * when building GET strings */ +#if 0 + case ' ': + bufputc(ob, '+'); + break; +#endif + + /* every other character goes with a %XX escaping */ + default: + hex_str[1] = hex_chars[(src[i] >> 4) & 0xF]; + hex_str[2] = hex_chars[src[i] & 0xF]; + bufput(ob, hex_str, 3); + } + + i++; + } +} diff --git a/SnudownTest/houdini_html_e.c b/SnudownTest/houdini_html_e.c new file mode 100644 index 0000000..085c4bf --- /dev/null +++ b/SnudownTest/houdini_html_e.c @@ -0,0 +1,87 @@ +#include +#include +#include + +#include "houdini.h" + +#define ESCAPE_GROW_FACTOR(x) (((x) * 12) / 10) /* this is very scientific, yes */ + +/** + * According to the OWASP rules: + * + * & --> & + * < --> < + * > --> > + * " --> " + * ' --> ' ' is not recommended + * / --> / forward slash is included as it helps end an HTML entity + * + */ +static const char HTML_ESCAPE_TABLE[] = { + 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 7, 7, 0, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 0, 0, 1, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 6, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static const char *HTML_ESCAPES[] = { + "", + """, + "&", + "'", + "/", + "<", + ">", + "", // throw out control characters +}; 
+ +void +houdini_escape_html0(struct buf *ob, const uint8_t *src, size_t size, int secure) +{ + size_t i = 0, org, esc = 0; + + bufgrow(ob, ESCAPE_GROW_FACTOR(size)); + + while (i < size) { + org = i; + while (i < size && (esc = HTML_ESCAPE_TABLE[src[i]]) == 0) + i++; + + if (i > org) + bufput(ob, src + org, i - org); + + /* escaping */ + if (i >= size) + break; + + /* The forward slash is only escaped in secure mode */ + if (src[i] == '/' && !secure) { + bufputc(ob, '/'); + } else if (HTML_ESCAPE_TABLE[src[i]] == 7) { + /* skip control characters */ + } else { + bufputs(ob, HTML_ESCAPES[esc]); + } + + i++; + } +} + +void +houdini_escape_html(struct buf *ob, const uint8_t *src, size_t size) +{ + houdini_escape_html0(ob, src, size, 1); +} + diff --git a/SnudownTest/html.c b/SnudownTest/html.c new file mode 100644 index 0000000..eebccc1 --- /dev/null +++ b/SnudownTest/html.c @@ -0,0 +1,790 @@ +/* + * Copyright (c) 2009, Natacha Porté + * Copyright (c) 2011, Vicent Marti + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include "markdown.h" +#include "html.h" + +#include +#include +#include +#include +#include + +#include "houdini.h" + +#define USE_XHTML(opt) (opt->flags & HTML_USE_XHTML) + +int +sdhtml_is_tag(const uint8_t *tag_data, size_t tag_size, const char *tagname) +{ + size_t i; + int closed = 0; + + if (tag_size < 3 || tag_data[0] != '<') + return HTML_TAG_NONE; + + i = 1; + + if (tag_data[i] == '/') { + closed = 1; + i++; + } + + for (; i < tag_size; ++i, ++tagname) { + if (*tagname == 0) + break; + + if (tag_data[i] != *tagname) + return HTML_TAG_NONE; + } + + if (i == tag_size) + return HTML_TAG_NONE; + + if (isspace(tag_data[i]) || tag_data[i] == '>') + return closed ? HTML_TAG_CLOSE : HTML_TAG_OPEN; + + return HTML_TAG_NONE; +} + +static inline void escape_html(struct buf *ob, const uint8_t *source, size_t length) +{ + houdini_escape_html0(ob, source, length, 0); +} + +static inline void escape_href(struct buf *ob, const uint8_t *source, size_t length) +{ + houdini_escape_href(ob, source, length); +} + +/******************** + * GENERIC RENDERER * + ********************/ +static int +rndr_autolink(struct buf *ob, const struct buf *link, enum mkd_autolink type, void *opaque) +{ + struct html_renderopt *options = opaque; + uint8_t offset = 0; + + if (!link || !link->size) + return 0; + + if ((options->flags & HTML_SAFELINK) != 0 && + !sd_autolink_issafe(link->data, link->size) && + type != MKDA_EMAIL) + return 0; + + BUFPUTSL(ob, "data + offset, link->size - offset); + + if (options->link_attributes) { + bufputc(ob, '\"'); + options->link_attributes(ob, link, opaque); + bufputc(ob, '>'); + } else { + BUFPUTSL(ob, "\">"); + } + + /* + * Pretty printing: if we get an email address as + * an actual URI, e.g. 
`mailto:foo@bar.com`, we don't + * want to print the `mailto:` prefix + */ + if (bufprefix(link, "mailto:") == 0) { + escape_html(ob, link->data + 7, link->size - 7); + } else { + escape_html(ob, link->data, link->size); + } + + BUFPUTSL(ob, ""); + + return 1; +} + +static void +rndr_blockcode(struct buf *ob, const struct buf *text, const struct buf *lang, void *opaque) +{ + if (ob->size) bufputc(ob, '\n'); + + if (lang && lang->size) { + size_t i, cls; + BUFPUTSL(ob, "
size; ++i, ++cls) {
+			while (i < lang->size && isspace(lang->data[i]))
+				i++;
+
+			if (i < lang->size) {
+				size_t org = i;
+				while (i < lang->size && !isspace(lang->data[i]))
+					i++;
+
+				if (lang->data[org] == '.')
+					org++;
+
+				if (cls) bufputc(ob, ' ');
+				escape_html(ob, lang->data + org, i - org);
+			}
+		}
+
+		BUFPUTSL(ob, "\">");
+	} else
+		BUFPUTSL(ob, "
");
+
+	if (text)
+		escape_html(ob, text->data, text->size);
+
+	BUFPUTSL(ob, "
\n"); +} + +static void +rndr_blockquote(struct buf *ob, const struct buf *text, void *opaque) +{ + if (ob->size) bufputc(ob, '\n'); + BUFPUTSL(ob, "
\n"); + if (text) bufput(ob, text->data, text->size); + BUFPUTSL(ob, "
\n"); +} + +static int +rndr_codespan(struct buf *ob, const struct buf *text, void *opaque) +{ + BUFPUTSL(ob, ""); + if (text) escape_html(ob, text->data, text->size); + BUFPUTSL(ob, ""); + return 1; +} + +static int +rndr_strikethrough(struct buf *ob, const struct buf *text, void *opaque) +{ + if (!text || !text->size) + return 0; + + BUFPUTSL(ob, ""); + bufput(ob, text->data, text->size); + BUFPUTSL(ob, ""); + return 1; +} + +static int +rndr_double_emphasis(struct buf *ob, const struct buf *text, void *opaque) +{ + if (!text || !text->size) + return 0; + + BUFPUTSL(ob, ""); + bufput(ob, text->data, text->size); + BUFPUTSL(ob, ""); + + return 1; +} + +static int +rndr_emphasis(struct buf *ob, const struct buf *text, void *opaque) +{ + if (!text || !text->size) return 0; + BUFPUTSL(ob, ""); + if (text) bufput(ob, text->data, text->size); + BUFPUTSL(ob, ""); + return 1; +} + +static int +rndr_linebreak(struct buf *ob, void *opaque) +{ + struct html_renderopt *options = opaque; + bufputs(ob, USE_XHTML(options) ? "
\n" : "
\n"); + return 1; +} + +static void +rndr_header(struct buf *ob, const struct buf *text, int level, void *opaque) +{ + struct html_renderopt *options = opaque; + + if (ob->size) + bufputc(ob, '\n'); + + if (options->flags & HTML_TOC) { + bufprintf(ob, "toc_id_prefix) { + bufputs(ob, options->toc_id_prefix); + } + bufprintf(ob, "toc_%d\">", options->toc_data.header_count++); + } else { + bufprintf(ob, "", level); + } + + if (text) bufput(ob, text->data, text->size); + bufprintf(ob, "\n", level); +} + +static int +rndr_link(struct buf *ob, const struct buf *link, const struct buf *title, const struct buf *content, void *opaque) +{ + struct html_renderopt *options = opaque; + + if (link != NULL && (options->flags & HTML_SAFELINK) != 0 && !sd_autolink_issafe(link->data, link->size)) + return 0; + + BUFPUTSL(ob, "size) + escape_href(ob, link->data, link->size); + + if (title && title->size) { + BUFPUTSL(ob, "\" title=\""); + escape_html(ob, title->data, title->size); + } + + if (options->link_attributes) { + bufputc(ob, '\"'); + options->link_attributes(ob, link, opaque); + bufputc(ob, '>'); + } else { + BUFPUTSL(ob, "\">"); + } + + if (content && content->size) bufput(ob, content->data, content->size); + BUFPUTSL(ob, ""); + return 1; +} + +static void +rndr_list(struct buf *ob, const struct buf *text, int flags, void *opaque) +{ + if (ob->size) bufputc(ob, '\n'); + bufput(ob, flags & MKD_LIST_ORDERED ? "
    \n" : "
      \n", 5); + if (text) bufput(ob, text->data, text->size); + bufput(ob, flags & MKD_LIST_ORDERED ? "
\n" : "\n", 6); +} + +static void +rndr_listitem(struct buf *ob, const struct buf *text, int flags, void *opaque) +{ + BUFPUTSL(ob, "
  • "); + if (text) { + size_t size = text->size; + while (size && text->data[size - 1] == '\n') + size--; + + bufput(ob, text->data, size); + } + BUFPUTSL(ob, "
  • \n"); +} + +static void +rndr_paragraph(struct buf *ob, const struct buf *text, void *opaque) +{ + struct html_renderopt *options = opaque; + size_t i = 0; + + if (ob->size) bufputc(ob, '\n'); + + if (!text || !text->size) + return; + + while (i < text->size && isspace(text->data[i])) i++; + + if (i == text->size) + return; + + BUFPUTSL(ob, "

    "); + if (options->flags & HTML_HARD_WRAP) { + size_t org; + while (i < text->size) { + org = i; + while (i < text->size && text->data[i] != '\n') + i++; + + if (i > org) + bufput(ob, text->data + org, i - org); + + /* + * do not insert a line break if this newline + * is the last character on the paragraph + */ + if (i >= text->size - 1) + break; + + rndr_linebreak(ob, opaque); + i++; + } + } else { + bufput(ob, &text->data[i], text->size - i); + } + BUFPUTSL(ob, "

    \n"); +} + +static void +rndr_raw_block(struct buf *ob, const struct buf *text, void *opaque) +{ + size_t org, sz; + if (!text) return; + sz = text->size; + while (sz > 0 && text->data[sz - 1] == '\n') sz--; + org = 0; + while (org < sz && text->data[org] == '\n') org++; + if (org >= sz) return; + if (ob->size) bufputc(ob, '\n'); + bufput(ob, text->data + org, sz - org); + bufputc(ob, '\n'); +} + +static int +rndr_triple_emphasis(struct buf *ob, const struct buf *text, void *opaque) +{ + if (!text || !text->size) return 0; + BUFPUTSL(ob, ""); + bufput(ob, text->data, text->size); + BUFPUTSL(ob, ""); + return 1; +} + +static void +rndr_hrule(struct buf *ob, void *opaque) +{ + struct html_renderopt *options = opaque; + if (ob->size) bufputc(ob, '\n'); + bufputs(ob, USE_XHTML(options) ? "
    \n" : "
    \n"); +} + +static int +rndr_image(struct buf *ob, const struct buf *link, const struct buf *title, const struct buf *alt, void *opaque) +{ + struct html_renderopt *options = opaque; + if (!link || !link->size) return 0; + + BUFPUTSL(ob, "data, link->size); + BUFPUTSL(ob, "\" alt=\""); + + if (alt && alt->size) + escape_html(ob, alt->data, alt->size); + + if (title && title->size) { + BUFPUTSL(ob, "\" title=\""); + escape_html(ob, title->data, title->size); } + + bufputs(ob, USE_XHTML(options) ? "\"/>" : "\">"); + return 1; +} + +static void +rndr_html_tag(struct buf *ob, const struct buf *text, void *opaque, + char* tagname, char** whitelist, int tagtype) +{ + size_t i, x, z, in_str = 0, seen_equals = 0, done = 0, done_attr = 0, reset = 0; + struct buf *attr; + struct buf *value; + char c; + + bufputc(ob, '<'); + + if(tagtype == HTML_TAG_CLOSE) { + bufputc(ob, '/'); + bufputs(ob, tagname); + bufputc(ob, '>'); + return; + } + + bufputs(ob, tagname); + i = 1 + strlen(tagname); + + attr = bufnew(16); + value = bufnew(16); + + for(; i < text->size && !done; i++) { + c = text->data[i]; + done = 0; + reset = 0; + done_attr = 0; + + switch(c) { + case '>': + done = 1; + break; + case '\'': + case '"': + if(!seen_equals) { + reset = 1; + } else if(!in_str) { + in_str = c; + } else if(in_str == c) { + in_str = 0; + done_attr = 1; + } else { + bufputc(value, c); + } + break; + case ' ': + if (in_str) { + bufputc(value, ' '); + } else { + reset = 1; + } + break; + case '=': + if(seen_equals) { + reset = 1; + break; + } + seen_equals = 1; + break; + default: + if(seen_equals && in_str || !seen_equals) { + bufputc(seen_equals ? 
value : attr, c); + } + break; + } + + if(done_attr) { + int valid = 0; + for(z = 0; whitelist[z]; z++) { + if(strlen(whitelist[z]) != attr->size) { + continue; + } + for(x = 0; x < attr->size; x++) { + if(tolower(whitelist[z][x]) != tolower(attr->data[x])) { + break; + } + } + if(x == attr->size) { + valid = 1; + break; + } + } + if(valid && value->size && attr->size) { + bufputc(ob, ' '); + escape_html(ob, attr->data, attr->size); + bufputs(ob, "=\""); + escape_html(ob, value->data, value->size); + bufputc(ob, '"'); + } + reset = 1; + } + + if(reset) { + seen_equals = 0; + in_str = 0; + bufreset(attr); + bufreset(value); + } + } + + bufrelease(attr); + bufrelease(value); + + bufputc(ob, '>'); +} + +static int +rndr_raw_html(struct buf *ob, const struct buf *text, void *opaque) +{ + struct html_renderopt *options = opaque; + char** whitelist = options->html_element_whitelist; + int i, tagtype; + + /* Items on the whitelist ignore all other flags and just output */ + if (((options->flags & HTML_ALLOW_ELEMENT_WHITELIST) != 0) && whitelist) { + for (i = 0; whitelist[i]; i++) { + tagtype = sdhtml_is_tag(text->data, text->size, whitelist[i]); + if (tagtype != HTML_TAG_NONE) { + rndr_html_tag(ob, text, opaque, + whitelist[i], + options->html_attr_whitelist, + tagtype); + return 1; + } + } + } + + /* HTML_ESCAPE overrides SKIP_HTML, SKIP_STYLE, SKIP_LINKS and SKIP_IMAGES + * It doens't see if there are any valid tags, just escape all of them. 
*/ + if((options->flags & HTML_ESCAPE) != 0) { + escape_html(ob, text->data, text->size); + return 1; + } + + if ((options->flags & HTML_SKIP_HTML) != 0) + return 1; + + if ((options->flags & HTML_SKIP_STYLE) != 0 && + sdhtml_is_tag(text->data, text->size, "style")) + return 1; + + if ((options->flags & HTML_SKIP_LINKS) != 0 && + sdhtml_is_tag(text->data, text->size, "a")) + return 1; + + if ((options->flags & HTML_SKIP_IMAGES) != 0 && + sdhtml_is_tag(text->data, text->size, "img")) + return 1; + + bufput(ob, text->data, text->size); + return 1; +} + +static void +rndr_table(struct buf *ob, const struct buf *header, const struct buf *body, void *opaque) +{ + if (ob->size) bufputc(ob, '\n'); + BUFPUTSL(ob, "\n"); + if (header) + bufput(ob, header->data, header->size); + BUFPUTSL(ob, "\n"); + if (body) + bufput(ob, body->data, body->size); + BUFPUTSL(ob, "
    \n"); +} + +static void +rndr_tablerow(struct buf *ob, const struct buf *text, void *opaque) +{ + BUFPUTSL(ob, "\n"); + if (text) + bufput(ob, text->data, text->size); + BUFPUTSL(ob, "\n"); +} + +static void +rndr_tablecell(struct buf *ob, const struct buf *text, int flags, void *opaque, int col_span) +{ + if (flags & MKD_TABLE_HEADER) { + BUFPUTSL(ob, " 1) { + bufprintf(ob, " colspan=\"%d\" ", col_span); + } + + switch (flags & MKD_TABLE_ALIGNMASK) { + case MKD_TABLE_ALIGN_CENTER: + BUFPUTSL(ob, " align=\"center\">"); + break; + + case MKD_TABLE_ALIGN_L: + BUFPUTSL(ob, " align=\"left\">"); + break; + + case MKD_TABLE_ALIGN_R: + BUFPUTSL(ob, " align=\"right\">"); + break; + + default: + BUFPUTSL(ob, ">"); + } + + if (text) + bufput(ob, text->data, text->size); + + if (flags & MKD_TABLE_HEADER) { + BUFPUTSL(ob, "\n"); + } else { + BUFPUTSL(ob, "\n"); + } +} + +static int +rndr_superscript(struct buf *ob, const struct buf *text, void *opaque) +{ + if (!text || !text->size) return 0; + BUFPUTSL(ob, ""); + bufput(ob, text->data, text->size); + BUFPUTSL(ob, ""); + return 1; +} + +static void +rndr_normal_text(struct buf *ob, const struct buf *text, void *opaque) +{ + if (text) + escape_html(ob, text->data, text->size); +} + +static void +toc_header(struct buf *ob, const struct buf *text, int level, void *opaque) +{ + struct html_renderopt *options = opaque; + + /* set the level offset if this is the first header + * we're parsing for the document */ + if (options->toc_data.current_level == 0) { + BUFPUTSL(ob, "
    \n"); + options->toc_data.level_offset = level - 1; + } + level -= options->toc_data.level_offset; + + if (level > options->toc_data.current_level) { + while (level > options->toc_data.current_level) { + BUFPUTSL(ob, "
      \n
    • \n"); + options->toc_data.current_level++; + } + } else if (level < options->toc_data.current_level) { + BUFPUTSL(ob, "
    • \n"); + while (level < options->toc_data.current_level) { + BUFPUTSL(ob, "
    \n\n"); + options->toc_data.current_level--; + } + BUFPUTSL(ob,"
  • \n"); + } else { + BUFPUTSL(ob,"
  • \n
  • \n"); + } + + BUFPUTSL(ob, "toc_id_prefix) { + bufputs(ob, options->toc_id_prefix); + } + + bufprintf(ob, "toc_%d\">", options->toc_data.header_count++); + if (text) + escape_html(ob, text->data, text->size); + BUFPUTSL(ob, "\n"); +} + +static int +toc_link(struct buf *ob, const struct buf *link, const struct buf *title, const struct buf *content, void *opaque) +{ + if (content && content->size) + bufput(ob, content->data, content->size); + return 1; +} + +static void +reset_toc(struct buf *ob, void *opaque) +{ + struct html_renderopt *options = opaque; + + memset(&(options->toc_data), 0, sizeof(options->toc_data)); +} + +static void +toc_finalize(struct buf *ob, void *opaque) +{ + struct html_renderopt *options = opaque; + bool has_toc = false; + while (options->toc_data.current_level > 0) { + BUFPUTSL(ob, "
  • \n\n"); + options->toc_data.current_level--; + has_toc = true; + } + if(has_toc) { + BUFPUTSL(ob, "
    \n"); + } + reset_toc(ob, opaque); +} + +void +sdhtml_toc_renderer(struct sd_callbacks *callbacks, struct html_renderopt *options) +{ + static const struct sd_callbacks cb_default = { + NULL, + NULL, + NULL, + toc_header, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + + NULL, + rndr_codespan, + rndr_double_emphasis, + rndr_emphasis, + NULL, + NULL, + toc_link, + NULL, + rndr_triple_emphasis, + rndr_strikethrough, + rndr_superscript, + + NULL, + NULL, + + NULL, + toc_finalize, + }; + + memset(options, 0x0, sizeof(struct html_renderopt)); + options->flags = HTML_TOC | HTML_SKIP_HTML; + + memcpy(callbacks, &cb_default, sizeof(struct sd_callbacks)); +} + +void +sdhtml_renderer(struct sd_callbacks *callbacks, struct html_renderopt *options, unsigned int render_flags) +{ + static const struct sd_callbacks cb_default = { + rndr_blockcode, + rndr_blockquote, + rndr_raw_block, + rndr_header, + rndr_hrule, + rndr_list, + rndr_listitem, + rndr_paragraph, + rndr_table, + rndr_tablerow, + rndr_tablecell, + + rndr_autolink, + rndr_codespan, + rndr_double_emphasis, + rndr_emphasis, + rndr_image, + rndr_linebreak, + rndr_link, + rndr_raw_html, + rndr_triple_emphasis, + rndr_strikethrough, + rndr_superscript, + + NULL, + rndr_normal_text, + + NULL, + reset_toc, + }; + + /* Prepare the options pointer */ + memset(options, 0x0, sizeof(struct html_renderopt)); + options->flags = render_flags; + + /* Prepare the callbacks */ + memcpy(callbacks, &cb_default, sizeof(struct sd_callbacks)); + + if (render_flags & HTML_SKIP_IMAGES) + callbacks->image = NULL; + + if (render_flags & HTML_SKIP_LINKS) { + callbacks->link = NULL; + callbacks->autolink = NULL; + } + + if (render_flags & HTML_SKIP_HTML || render_flags & HTML_ESCAPE) + callbacks->blockhtml = NULL; +} diff --git a/SnudownTest/html.h b/SnudownTest/html.h new file mode 100644 index 0000000..59103b3 --- /dev/null +++ b/SnudownTest/html.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2011, Vicent Marti + * + * Permission to use, 
copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef UPSKIRT_HTML_H +#define UPSKIRT_HTML_H + +#include "markdown.h" +#include "buffer.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct html_renderopt { + struct { + int header_count; + int current_level; + int level_offset; + } toc_data; + + char* toc_id_prefix; + + unsigned int flags; + + char** html_element_whitelist; + char** html_attr_whitelist; + + /* extra callbacks */ + void (*link_attributes)(struct buf *ob, const struct buf *url, void *self); +}; + +typedef enum { + HTML_SKIP_HTML = (1 << 0), + HTML_SKIP_STYLE = (1 << 1), + HTML_SKIP_IMAGES = (1 << 2), + HTML_SKIP_LINKS = (1 << 3), + HTML_EXPAND_TABS = (1 << 4), + HTML_SAFELINK = (1 << 5), + HTML_TOC = (1 << 6), + HTML_HARD_WRAP = (1 << 7), + HTML_USE_XHTML = (1 << 8), + HTML_ESCAPE = (1 << 9), + HTML_ALLOW_ELEMENT_WHITELIST = (1 << 10), +} html_render_mode; + +typedef enum { + HTML_TAG_NONE = 0, + HTML_TAG_OPEN, + HTML_TAG_CLOSE, +} html_tag; + +int +sdhtml_is_tag(const uint8_t *tag_data, size_t tag_size, const char *tagname); + +extern void +sdhtml_renderer(struct sd_callbacks *callbacks, struct html_renderopt *options_ptr, unsigned int render_flags); + +extern void +sdhtml_toc_renderer(struct sd_callbacks *callbacks, struct html_renderopt *options_ptr); + +extern void 
+sdhtml_smartypants(struct buf *ob, const uint8_t *text, size_t size); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/SnudownTest/html/houdini.h b/SnudownTest/html/houdini.h new file mode 100644 index 0000000..b4954c0 --- /dev/null +++ b/SnudownTest/html/houdini.h @@ -0,0 +1,37 @@ +#ifndef HOUDINI_H__ +#define HOUDINI_H__ + +#include "buffer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef HOUDINI_USE_LOCALE +# define _isxdigit(c) isxdigit(c) +# define _isdigit(c) isdigit(c) +#else +/* + * Helper _isdigit methods -- do not trust the current locale + * */ +# define _isxdigit(c) (strchr("0123456789ABCDEFabcdef", (c)) != NULL) +# define _isdigit(c) ((c) >= '0' && (c) <= '9') +#endif + +extern void houdini_escape_html(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_escape_html0(struct buf *ob, const uint8_t *src, size_t size, int secure); +extern void houdini_unescape_html(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_escape_xml(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_escape_uri(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_escape_url(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_escape_href(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_unescape_uri(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_unescape_url(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_escape_js(struct buf *ob, const uint8_t *src, size_t size); +extern void houdini_unescape_js(struct buf *ob, const uint8_t *src, size_t size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/SnudownTest/html/houdini_href_e.c b/SnudownTest/html/houdini_href_e.c new file mode 100644 index 0000000..581df1f --- /dev/null +++ b/SnudownTest/html/houdini_href_e.c @@ -0,0 +1,116 @@ +#include +#include +#include + +#include "houdini.h" + +#define ESCAPE_GROW_FACTOR(x) (((x) * 12) / 10) + +/* + * The 
following characters will not be escaped: + * + * -_.+!*'(),%#@?=;:/,+&$ alphanum + * + * Note that this character set is the addition of: + * + * - The characters which are safe to be in an URL + * - The characters which are *not* safe to be in + * an URL because they are RESERVED characters. + * + * We asume (lazily) that any RESERVED char that + * appears inside an URL is actually meant to + * have its native function (i.e. as an URL + * component/separator) and hence needs no escaping. + * + * There are two exceptions: the chacters & (amp) + * and ' (single quote) do not appear in the table. + * They are meant to appear in the URL as components, + * yet they require special HTML-entity escaping + * to generate valid HTML markup. + * + * All other characters will be escaped to %XX. + * + */ +static const char HREF_SAFE[] = { + 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +void +houdini_escape_href(struct buf *ob, const uint8_t *src, size_t size) +{ + static const char hex_chars[] = "0123456789ABCDEF"; + size_t i = 0, org; + char hex_str[3]; + + bufgrow(ob, ESCAPE_GROW_FACTOR(size)); + hex_str[0] = '%'; + + while (i < size) { + org = i; + /* Skip by characters that don't need special + * processing */ + while (i < size 
&& HREF_SAFE[src[i]] == 1) + i++; + + if (i > org) + bufput(ob, src + org, i - org); + + /* escaping */ + if (i >= size) + break; + + /* throw out control characters */ + if (HREF_SAFE[src[i]] == 2) { + i++; + continue; + } + + switch (src[i]) { + /* amp appears all the time in URLs, but needs + * HTML-entity escaping to be inside an href */ + case '&': + BUFPUTSL(ob, "&"); + break; + + /* the single quote is a valid URL character + * according to the standard; it needs HTML + * entity escaping too */ + case '\'': + BUFPUTSL(ob, "'"); + break; + + /* the space can be escaped to %20 or a plus + * sign. we're going with the generic escape + * for now. the plus thing is more commonly seen + * when building GET strings */ +#if 0 + case ' ': + bufputc(ob, '+'); + break; +#endif + + /* every other character goes with a %XX escaping */ + default: + hex_str[1] = hex_chars[(src[i] >> 4) & 0xF]; + hex_str[2] = hex_chars[src[i] & 0xF]; + bufput(ob, hex_str, 3); + } + + i++; + } +} diff --git a/SnudownTest/html/houdini_html_e.c b/SnudownTest/html/houdini_html_e.c new file mode 100644 index 0000000..085c4bf --- /dev/null +++ b/SnudownTest/html/houdini_html_e.c @@ -0,0 +1,87 @@ +#include +#include +#include + +#include "houdini.h" + +#define ESCAPE_GROW_FACTOR(x) (((x) * 12) / 10) /* this is very scientific, yes */ + +/** + * According to the OWASP rules: + * + * & --> & + * < --> < + * > --> > + * " --> " + * ' --> ' ' is not recommended + * / --> / forward slash is included as it helps end an HTML entity + * + */ +static const char HTML_ESCAPE_TABLE[] = { + 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 7, 7, 0, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 0, 0, 1, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 6, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static const char *HTML_ESCAPES[] = { + "", + """, + "&", + "'", + "/", + "<", + ">", + "", // throw out control characters +}; + +void +houdini_escape_html0(struct buf *ob, const uint8_t *src, size_t size, int secure) +{ + size_t i = 0, org, esc = 0; + + bufgrow(ob, ESCAPE_GROW_FACTOR(size)); + + while (i < size) { + org = i; + while (i < size && (esc = HTML_ESCAPE_TABLE[src[i]]) == 0) + i++; + + if (i > org) + bufput(ob, src + org, i - org); + + /* escaping */ + if (i >= size) + break; + + /* The forward slash is only escaped in secure mode */ + if (src[i] == '/' && !secure) { + bufputc(ob, '/'); + } else if (HTML_ESCAPE_TABLE[src[i]] == 7) { + /* skip control characters */ + } else { + bufputs(ob, HTML_ESCAPES[esc]); + } + + i++; + } +} + +void +houdini_escape_html(struct buf *ob, const uint8_t *src, size_t size) +{ + houdini_escape_html0(ob, src, size, 1); +} + diff --git a/SnudownTest/html/html.c b/SnudownTest/html/html.c new file mode 100644 index 0000000..eebccc1 --- /dev/null +++ b/SnudownTest/html/html.c @@ -0,0 +1,790 @@ +/* + * Copyright (c) 2009, Natacha Porté + * Copyright (c) 2011, Vicent Marti + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "markdown.h" +#include "html.h" + +#include +#include +#include +#include +#include + +#include "houdini.h" + +#define USE_XHTML(opt) (opt->flags & HTML_USE_XHTML) + +int +sdhtml_is_tag(const uint8_t *tag_data, size_t tag_size, const char *tagname) +{ + size_t i; + int closed = 0; + + if (tag_size < 3 || tag_data[0] != '<') + return HTML_TAG_NONE; + + i = 1; + + if (tag_data[i] == '/') { + closed = 1; + i++; + } + + for (; i < tag_size; ++i, ++tagname) { + if (*tagname == 0) + break; + + if (tag_data[i] != *tagname) + return HTML_TAG_NONE; + } + + if (i == tag_size) + return HTML_TAG_NONE; + + if (isspace(tag_data[i]) || tag_data[i] == '>') + return closed ? 
HTML_TAG_CLOSE : HTML_TAG_OPEN; + + return HTML_TAG_NONE; +} + +static inline void escape_html(struct buf *ob, const uint8_t *source, size_t length) +{ + houdini_escape_html0(ob, source, length, 0); +} + +static inline void escape_href(struct buf *ob, const uint8_t *source, size_t length) +{ + houdini_escape_href(ob, source, length); +} + +/******************** + * GENERIC RENDERER * + ********************/ +static int +rndr_autolink(struct buf *ob, const struct buf *link, enum mkd_autolink type, void *opaque) +{ + struct html_renderopt *options = opaque; + uint8_t offset = 0; + + if (!link || !link->size) + return 0; + + if ((options->flags & HTML_SAFELINK) != 0 && + !sd_autolink_issafe(link->data, link->size) && + type != MKDA_EMAIL) + return 0; + + BUFPUTSL(ob, "data + offset, link->size - offset); + + if (options->link_attributes) { + bufputc(ob, '\"'); + options->link_attributes(ob, link, opaque); + bufputc(ob, '>'); + } else { + BUFPUTSL(ob, "\">"); + } + + /* + * Pretty printing: if we get an email address as + * an actual URI, e.g. `mailto:foo@bar.com`, we don't + * want to print the `mailto:` prefix + */ + if (bufprefix(link, "mailto:") == 0) { + escape_html(ob, link->data + 7, link->size - 7); + } else { + escape_html(ob, link->data, link->size); + } + + BUFPUTSL(ob, ""); + + return 1; +} + +static void +rndr_blockcode(struct buf *ob, const struct buf *text, const struct buf *lang, void *opaque) +{ + if (ob->size) bufputc(ob, '\n'); + + if (lang && lang->size) { + size_t i, cls; + BUFPUTSL(ob, "
    size; ++i, ++cls) {
    +			while (i < lang->size && isspace(lang->data[i]))
    +				i++;
    +
    +			if (i < lang->size) {
    +				size_t org = i;
    +				while (i < lang->size && !isspace(lang->data[i]))
    +					i++;
    +
    +				if (lang->data[org] == '.')
    +					org++;
    +
    +				if (cls) bufputc(ob, ' ');
    +				escape_html(ob, lang->data + org, i - org);
    +			}
    +		}
    +
    +		BUFPUTSL(ob, "\">");
    +	} else
    +		BUFPUTSL(ob, "
    ");
    +
    +	if (text)
    +		escape_html(ob, text->data, text->size);
    +
    +	BUFPUTSL(ob, "
    \n"); +} + +static void +rndr_blockquote(struct buf *ob, const struct buf *text, void *opaque) +{ + if (ob->size) bufputc(ob, '\n'); + BUFPUTSL(ob, "
    \n"); + if (text) bufput(ob, text->data, text->size); + BUFPUTSL(ob, "
    \n"); +} + +static int +rndr_codespan(struct buf *ob, const struct buf *text, void *opaque) +{ + BUFPUTSL(ob, ""); + if (text) escape_html(ob, text->data, text->size); + BUFPUTSL(ob, ""); + return 1; +} + +static int +rndr_strikethrough(struct buf *ob, const struct buf *text, void *opaque) +{ + if (!text || !text->size) + return 0; + + BUFPUTSL(ob, ""); + bufput(ob, text->data, text->size); + BUFPUTSL(ob, ""); + return 1; +} + +static int +rndr_double_emphasis(struct buf *ob, const struct buf *text, void *opaque) +{ + if (!text || !text->size) + return 0; + + BUFPUTSL(ob, ""); + bufput(ob, text->data, text->size); + BUFPUTSL(ob, ""); + + return 1; +} + +static int +rndr_emphasis(struct buf *ob, const struct buf *text, void *opaque) +{ + if (!text || !text->size) return 0; + BUFPUTSL(ob, ""); + if (text) bufput(ob, text->data, text->size); + BUFPUTSL(ob, ""); + return 1; +} + +static int +rndr_linebreak(struct buf *ob, void *opaque) +{ + struct html_renderopt *options = opaque; + bufputs(ob, USE_XHTML(options) ? "
    \n" : "
    \n"); + return 1; +} + +static void +rndr_header(struct buf *ob, const struct buf *text, int level, void *opaque) +{ + struct html_renderopt *options = opaque; + + if (ob->size) + bufputc(ob, '\n'); + + if (options->flags & HTML_TOC) { + bufprintf(ob, "toc_id_prefix) { + bufputs(ob, options->toc_id_prefix); + } + bufprintf(ob, "toc_%d\">", options->toc_data.header_count++); + } else { + bufprintf(ob, "", level); + } + + if (text) bufput(ob, text->data, text->size); + bufprintf(ob, "\n", level); +} + +static int +rndr_link(struct buf *ob, const struct buf *link, const struct buf *title, const struct buf *content, void *opaque) +{ + struct html_renderopt *options = opaque; + + if (link != NULL && (options->flags & HTML_SAFELINK) != 0 && !sd_autolink_issafe(link->data, link->size)) + return 0; + + BUFPUTSL(ob, "size) + escape_href(ob, link->data, link->size); + + if (title && title->size) { + BUFPUTSL(ob, "\" title=\""); + escape_html(ob, title->data, title->size); + } + + if (options->link_attributes) { + bufputc(ob, '\"'); + options->link_attributes(ob, link, opaque); + bufputc(ob, '>'); + } else { + BUFPUTSL(ob, "\">"); + } + + if (content && content->size) bufput(ob, content->data, content->size); + BUFPUTSL(ob, ""); + return 1; +} + +static void +rndr_list(struct buf *ob, const struct buf *text, int flags, void *opaque) +{ + if (ob->size) bufputc(ob, '\n'); + bufput(ob, flags & MKD_LIST_ORDERED ? "
      \n" : "
        \n", 5); + if (text) bufput(ob, text->data, text->size); + bufput(ob, flags & MKD_LIST_ORDERED ? "
    \n" : "\n", 6); +} + +static void +rndr_listitem(struct buf *ob, const struct buf *text, int flags, void *opaque) +{ + BUFPUTSL(ob, "
  • "); + if (text) { + size_t size = text->size; + while (size && text->data[size - 1] == '\n') + size--; + + bufput(ob, text->data, size); + } + BUFPUTSL(ob, "
  • \n"); +} + +static void +rndr_paragraph(struct buf *ob, const struct buf *text, void *opaque) +{ + struct html_renderopt *options = opaque; + size_t i = 0; + + if (ob->size) bufputc(ob, '\n'); + + if (!text || !text->size) + return; + + while (i < text->size && isspace(text->data[i])) i++; + + if (i == text->size) + return; + + BUFPUTSL(ob, "

    "); + if (options->flags & HTML_HARD_WRAP) { + size_t org; + while (i < text->size) { + org = i; + while (i < text->size && text->data[i] != '\n') + i++; + + if (i > org) + bufput(ob, text->data + org, i - org); + + /* + * do not insert a line break if this newline + * is the last character on the paragraph + */ + if (i >= text->size - 1) + break; + + rndr_linebreak(ob, opaque); + i++; + } + } else { + bufput(ob, &text->data[i], text->size - i); + } + BUFPUTSL(ob, "

    \n"); +} + +static void +rndr_raw_block(struct buf *ob, const struct buf *text, void *opaque) +{ + size_t org, sz; + if (!text) return; + sz = text->size; + while (sz > 0 && text->data[sz - 1] == '\n') sz--; + org = 0; + while (org < sz && text->data[org] == '\n') org++; + if (org >= sz) return; + if (ob->size) bufputc(ob, '\n'); + bufput(ob, text->data + org, sz - org); + bufputc(ob, '\n'); +} + +static int +rndr_triple_emphasis(struct buf *ob, const struct buf *text, void *opaque) +{ + if (!text || !text->size) return 0; + BUFPUTSL(ob, ""); + bufput(ob, text->data, text->size); + BUFPUTSL(ob, ""); + return 1; +} + +static void +rndr_hrule(struct buf *ob, void *opaque) +{ + struct html_renderopt *options = opaque; + if (ob->size) bufputc(ob, '\n'); + bufputs(ob, USE_XHTML(options) ? "
    \n" : "
    \n"); +} + +static int +rndr_image(struct buf *ob, const struct buf *link, const struct buf *title, const struct buf *alt, void *opaque) +{ + struct html_renderopt *options = opaque; + if (!link || !link->size) return 0; + + BUFPUTSL(ob, "data, link->size); + BUFPUTSL(ob, "\" alt=\""); + + if (alt && alt->size) + escape_html(ob, alt->data, alt->size); + + if (title && title->size) { + BUFPUTSL(ob, "\" title=\""); + escape_html(ob, title->data, title->size); } + + bufputs(ob, USE_XHTML(options) ? "\"/>" : "\">"); + return 1; +} + +static void +rndr_html_tag(struct buf *ob, const struct buf *text, void *opaque, + char* tagname, char** whitelist, int tagtype) +{ + size_t i, x, z, in_str = 0, seen_equals = 0, done = 0, done_attr = 0, reset = 0; + struct buf *attr; + struct buf *value; + char c; + + bufputc(ob, '<'); + + if(tagtype == HTML_TAG_CLOSE) { + bufputc(ob, '/'); + bufputs(ob, tagname); + bufputc(ob, '>'); + return; + } + + bufputs(ob, tagname); + i = 1 + strlen(tagname); + + attr = bufnew(16); + value = bufnew(16); + + for(; i < text->size && !done; i++) { + c = text->data[i]; + done = 0; + reset = 0; + done_attr = 0; + + switch(c) { + case '>': + done = 1; + break; + case '\'': + case '"': + if(!seen_equals) { + reset = 1; + } else if(!in_str) { + in_str = c; + } else if(in_str == c) { + in_str = 0; + done_attr = 1; + } else { + bufputc(value, c); + } + break; + case ' ': + if (in_str) { + bufputc(value, ' '); + } else { + reset = 1; + } + break; + case '=': + if(seen_equals) { + reset = 1; + break; + } + seen_equals = 1; + break; + default: + if(seen_equals && in_str || !seen_equals) { + bufputc(seen_equals ? 
value : attr, c); + } + break; + } + + if(done_attr) { + int valid = 0; + for(z = 0; whitelist[z]; z++) { + if(strlen(whitelist[z]) != attr->size) { + continue; + } + for(x = 0; x < attr->size; x++) { + if(tolower(whitelist[z][x]) != tolower(attr->data[x])) { + break; + } + } + if(x == attr->size) { + valid = 1; + break; + } + } + if(valid && value->size && attr->size) { + bufputc(ob, ' '); + escape_html(ob, attr->data, attr->size); + bufputs(ob, "=\""); + escape_html(ob, value->data, value->size); + bufputc(ob, '"'); + } + reset = 1; + } + + if(reset) { + seen_equals = 0; + in_str = 0; + bufreset(attr); + bufreset(value); + } + } + + bufrelease(attr); + bufrelease(value); + + bufputc(ob, '>'); +} + +static int +rndr_raw_html(struct buf *ob, const struct buf *text, void *opaque) +{ + struct html_renderopt *options = opaque; + char** whitelist = options->html_element_whitelist; + int i, tagtype; + + /* Items on the whitelist ignore all other flags and just output */ + if (((options->flags & HTML_ALLOW_ELEMENT_WHITELIST) != 0) && whitelist) { + for (i = 0; whitelist[i]; i++) { + tagtype = sdhtml_is_tag(text->data, text->size, whitelist[i]); + if (tagtype != HTML_TAG_NONE) { + rndr_html_tag(ob, text, opaque, + whitelist[i], + options->html_attr_whitelist, + tagtype); + return 1; + } + } + } + + /* HTML_ESCAPE overrides SKIP_HTML, SKIP_STYLE, SKIP_LINKS and SKIP_IMAGES + * It doens't see if there are any valid tags, just escape all of them. 
*/ + if((options->flags & HTML_ESCAPE) != 0) { + escape_html(ob, text->data, text->size); + return 1; + } + + if ((options->flags & HTML_SKIP_HTML) != 0) + return 1; + + if ((options->flags & HTML_SKIP_STYLE) != 0 && + sdhtml_is_tag(text->data, text->size, "style")) + return 1; + + if ((options->flags & HTML_SKIP_LINKS) != 0 && + sdhtml_is_tag(text->data, text->size, "a")) + return 1; + + if ((options->flags & HTML_SKIP_IMAGES) != 0 && + sdhtml_is_tag(text->data, text->size, "img")) + return 1; + + bufput(ob, text->data, text->size); + return 1; +} + +static void +rndr_table(struct buf *ob, const struct buf *header, const struct buf *body, void *opaque) +{ + if (ob->size) bufputc(ob, '\n'); + BUFPUTSL(ob, "\n"); + if (header) + bufput(ob, header->data, header->size); + BUFPUTSL(ob, "\n"); + if (body) + bufput(ob, body->data, body->size); + BUFPUTSL(ob, "
    \n"); +} + +static void +rndr_tablerow(struct buf *ob, const struct buf *text, void *opaque) +{ + BUFPUTSL(ob, "\n"); + if (text) + bufput(ob, text->data, text->size); + BUFPUTSL(ob, "\n"); +} + +static void +rndr_tablecell(struct buf *ob, const struct buf *text, int flags, void *opaque, int col_span) +{ + if (flags & MKD_TABLE_HEADER) { + BUFPUTSL(ob, " 1) { + bufprintf(ob, " colspan=\"%d\" ", col_span); + } + + switch (flags & MKD_TABLE_ALIGNMASK) { + case MKD_TABLE_ALIGN_CENTER: + BUFPUTSL(ob, " align=\"center\">"); + break; + + case MKD_TABLE_ALIGN_L: + BUFPUTSL(ob, " align=\"left\">"); + break; + + case MKD_TABLE_ALIGN_R: + BUFPUTSL(ob, " align=\"right\">"); + break; + + default: + BUFPUTSL(ob, ">"); + } + + if (text) + bufput(ob, text->data, text->size); + + if (flags & MKD_TABLE_HEADER) { + BUFPUTSL(ob, "\n"); + } else { + BUFPUTSL(ob, "\n"); + } +} + +static int +rndr_superscript(struct buf *ob, const struct buf *text, void *opaque) +{ + if (!text || !text->size) return 0; + BUFPUTSL(ob, ""); + bufput(ob, text->data, text->size); + BUFPUTSL(ob, ""); + return 1; +} + +static void +rndr_normal_text(struct buf *ob, const struct buf *text, void *opaque) +{ + if (text) + escape_html(ob, text->data, text->size); +} + +static void +toc_header(struct buf *ob, const struct buf *text, int level, void *opaque) +{ + struct html_renderopt *options = opaque; + + /* set the level offset if this is the first header + * we're parsing for the document */ + if (options->toc_data.current_level == 0) { + BUFPUTSL(ob, "
    \n"); + options->toc_data.level_offset = level - 1; + } + level -= options->toc_data.level_offset; + + if (level > options->toc_data.current_level) { + while (level > options->toc_data.current_level) { + BUFPUTSL(ob, "
      \n
    • \n"); + options->toc_data.current_level++; + } + } else if (level < options->toc_data.current_level) { + BUFPUTSL(ob, "
    • \n"); + while (level < options->toc_data.current_level) { + BUFPUTSL(ob, "
    \n\n"); + options->toc_data.current_level--; + } + BUFPUTSL(ob,"
  • \n"); + } else { + BUFPUTSL(ob,"
  • \n
  • \n"); + } + + BUFPUTSL(ob, "toc_id_prefix) { + bufputs(ob, options->toc_id_prefix); + } + + bufprintf(ob, "toc_%d\">", options->toc_data.header_count++); + if (text) + escape_html(ob, text->data, text->size); + BUFPUTSL(ob, "\n"); +} + +static int +toc_link(struct buf *ob, const struct buf *link, const struct buf *title, const struct buf *content, void *opaque) +{ + if (content && content->size) + bufput(ob, content->data, content->size); + return 1; +} + +static void +reset_toc(struct buf *ob, void *opaque) +{ + struct html_renderopt *options = opaque; + + memset(&(options->toc_data), 0, sizeof(options->toc_data)); +} + +static void +toc_finalize(struct buf *ob, void *opaque) +{ + struct html_renderopt *options = opaque; + bool has_toc = false; + while (options->toc_data.current_level > 0) { + BUFPUTSL(ob, "
  • \n\n"); + options->toc_data.current_level--; + has_toc = true; + } + if(has_toc) { + BUFPUTSL(ob, "
    \n"); + } + reset_toc(ob, opaque); +} + +void +sdhtml_toc_renderer(struct sd_callbacks *callbacks, struct html_renderopt *options) +{ + static const struct sd_callbacks cb_default = { + NULL, + NULL, + NULL, + toc_header, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + + NULL, + rndr_codespan, + rndr_double_emphasis, + rndr_emphasis, + NULL, + NULL, + toc_link, + NULL, + rndr_triple_emphasis, + rndr_strikethrough, + rndr_superscript, + + NULL, + NULL, + + NULL, + toc_finalize, + }; + + memset(options, 0x0, sizeof(struct html_renderopt)); + options->flags = HTML_TOC | HTML_SKIP_HTML; + + memcpy(callbacks, &cb_default, sizeof(struct sd_callbacks)); +} + +void +sdhtml_renderer(struct sd_callbacks *callbacks, struct html_renderopt *options, unsigned int render_flags) +{ + static const struct sd_callbacks cb_default = { + rndr_blockcode, + rndr_blockquote, + rndr_raw_block, + rndr_header, + rndr_hrule, + rndr_list, + rndr_listitem, + rndr_paragraph, + rndr_table, + rndr_tablerow, + rndr_tablecell, + + rndr_autolink, + rndr_codespan, + rndr_double_emphasis, + rndr_emphasis, + rndr_image, + rndr_linebreak, + rndr_link, + rndr_raw_html, + rndr_triple_emphasis, + rndr_strikethrough, + rndr_superscript, + + NULL, + rndr_normal_text, + + NULL, + reset_toc, + }; + + /* Prepare the options pointer */ + memset(options, 0x0, sizeof(struct html_renderopt)); + options->flags = render_flags; + + /* Prepare the callbacks */ + memcpy(callbacks, &cb_default, sizeof(struct sd_callbacks)); + + if (render_flags & HTML_SKIP_IMAGES) + callbacks->image = NULL; + + if (render_flags & HTML_SKIP_LINKS) { + callbacks->link = NULL; + callbacks->autolink = NULL; + } + + if (render_flags & HTML_SKIP_HTML || render_flags & HTML_ESCAPE) + callbacks->blockhtml = NULL; +} diff --git a/SnudownTest/html/html.h b/SnudownTest/html/html.h new file mode 100644 index 0000000..59103b3 --- /dev/null +++ b/SnudownTest/html/html.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2011, Vicent Marti + * + * 
Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef UPSKIRT_HTML_H +#define UPSKIRT_HTML_H + +#include "markdown.h" +#include "buffer.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct html_renderopt { + struct { + int header_count; + int current_level; + int level_offset; + } toc_data; + + char* toc_id_prefix; + + unsigned int flags; + + char** html_element_whitelist; + char** html_attr_whitelist; + + /* extra callbacks */ + void (*link_attributes)(struct buf *ob, const struct buf *url, void *self); +}; + +typedef enum { + HTML_SKIP_HTML = (1 << 0), + HTML_SKIP_STYLE = (1 << 1), + HTML_SKIP_IMAGES = (1 << 2), + HTML_SKIP_LINKS = (1 << 3), + HTML_EXPAND_TABS = (1 << 4), + HTML_SAFELINK = (1 << 5), + HTML_TOC = (1 << 6), + HTML_HARD_WRAP = (1 << 7), + HTML_USE_XHTML = (1 << 8), + HTML_ESCAPE = (1 << 9), + HTML_ALLOW_ELEMENT_WHITELIST = (1 << 10), +} html_render_mode; + +typedef enum { + HTML_TAG_NONE = 0, + HTML_TAG_OPEN, + HTML_TAG_CLOSE, +} html_tag; + +int +sdhtml_is_tag(const uint8_t *tag_data, size_t tag_size, const char *tagname); + +extern void +sdhtml_renderer(struct sd_callbacks *callbacks, struct html_renderopt *options_ptr, unsigned int render_flags); + +extern void +sdhtml_toc_renderer(struct sd_callbacks *callbacks, struct html_renderopt *options_ptr); + 
+extern void +sdhtml_smartypants(struct buf *ob, const uint8_t *text, size_t size); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/SnudownTest/html/html_smartypants.c b/SnudownTest/html/html_smartypants.c new file mode 100644 index 0000000..4db8f02 --- /dev/null +++ b/SnudownTest/html/html_smartypants.c @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2011, Vicent Marti + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include "buffer.h" +#include "html.h" + +#include +#include +#include +#include + +#if defined(_WIN32) +#define snprintf _snprintf +#endif + +struct smartypants_data { + int in_squote; + int in_dquote; +}; + +static size_t smartypants_cb__ltag(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__dquote(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__amp(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__period(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__number(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__dash(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__parens(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__squote(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__backtick(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__escape(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); + +static size_t (*smartypants_cb_ptrs[]) + (struct buf *, struct smartypants_data *, uint8_t, const uint8_t *, size_t) = +{ + NULL, /* 0 */ + smartypants_cb__dash, /* 1 */ + smartypants_cb__parens, /* 2 */ + smartypants_cb__squote, /* 3 */ + smartypants_cb__dquote, /* 4 */ + smartypants_cb__amp, /* 5 */ + smartypants_cb__period, /* 6 */ + 
smartypants_cb__number, /* 7 */ + smartypants_cb__ltag, /* 8 */ + smartypants_cb__backtick, /* 9 */ + smartypants_cb__escape, /* 10 */ +}; + +static const uint8_t smartypants_cb_chars[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 4, 0, 0, 0, 5, 3, 2, 0, 0, 0, 0, 1, 6, 0, + 0, 7, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static inline int +word_boundary(uint8_t c) +{ + return c == 0 || isspace(c) || ispunct(c); +} + +static int +smartypants_quotes(struct buf *ob, uint8_t previous_char, uint8_t next_char, uint8_t quote, int *is_open) +{ + char ent[8]; + + if (*is_open && !word_boundary(next_char)) + return 0; + + if (!(*is_open) && !word_boundary(previous_char)) + return 0; + + snprintf(ent, sizeof(ent), "&%c%cquo;", (*is_open) ? 'r' : 'l', quote); + *is_open = !(*is_open); + bufputs(ob, ent); + return 1; +} + +static size_t +smartypants_cb__squote(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (size >= 2) { + uint8_t t1 = tolower(text[1]); + + if (t1 == '\'') { + if (smartypants_quotes(ob, previous_char, size >= 3 ? 
text[2] : 0, 'd', &smrt->in_dquote)) + return 1; + } + + if ((t1 == 's' || t1 == 't' || t1 == 'm' || t1 == 'd') && + (size == 3 || word_boundary(text[2]))) { + BUFPUTSL(ob, "’"); + return 0; + } + + if (size >= 3) { + uint8_t t2 = tolower(text[2]); + + if (((t1 == 'r' && t2 == 'e') || + (t1 == 'l' && t2 == 'l') || + (t1 == 'v' && t2 == 'e')) && + (size == 4 || word_boundary(text[3]))) { + BUFPUTSL(ob, "’"); + return 0; + } + } + } + + if (smartypants_quotes(ob, previous_char, size > 0 ? text[1] : 0, 's', &smrt->in_squote)) + return 0; + + bufputc(ob, text[0]); + return 0; +} + +static size_t +smartypants_cb__parens(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (size >= 3) { + uint8_t t1 = tolower(text[1]); + uint8_t t2 = tolower(text[2]); + + if (t1 == 'c' && t2 == ')') { + BUFPUTSL(ob, "©"); + return 2; + } + + if (t1 == 'r' && t2 == ')') { + BUFPUTSL(ob, "®"); + return 2; + } + + if (size >= 4 && t1 == 't' && t2 == 'm' && text[3] == ')') { + BUFPUTSL(ob, "™"); + return 3; + } + } + + bufputc(ob, text[0]); + return 0; +} + +static size_t +smartypants_cb__dash(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (size >= 3 && text[1] == '-' && text[2] == '-') { + BUFPUTSL(ob, "—"); + return 2; + } + + if (size >= 2 && text[1] == '-') { + BUFPUTSL(ob, "–"); + return 1; + } + + bufputc(ob, text[0]); + return 0; +} + +static size_t +smartypants_cb__amp(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (size >= 6 && memcmp(text, """, 6) == 0) { + if (smartypants_quotes(ob, previous_char, size >= 7 ? 
text[6] : 0, 'd', &smrt->in_dquote)) + return 5; + } + + if (size >= 4 && memcmp(text, "�", 4) == 0) + return 3; + + bufputc(ob, '&'); + return 0; +} + +static size_t +smartypants_cb__period(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (size >= 3 && text[1] == '.' && text[2] == '.') { + BUFPUTSL(ob, "…"); + return 2; + } + + if (size >= 5 && text[1] == ' ' && text[2] == '.' && text[3] == ' ' && text[4] == '.') { + BUFPUTSL(ob, "…"); + return 4; + } + + bufputc(ob, text[0]); + return 0; +} + +static size_t +smartypants_cb__backtick(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (size >= 2 && text[1] == '`') { + if (smartypants_quotes(ob, previous_char, size >= 3 ? text[2] : 0, 'd', &smrt->in_dquote)) + return 1; + } + + return 0; +} + +static size_t +smartypants_cb__number(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (word_boundary(previous_char) && size >= 3) { + if (text[0] == '1' && text[1] == '/' && text[2] == '2') { + if (size == 3 || word_boundary(text[3])) { + BUFPUTSL(ob, "½"); + return 2; + } + } + + if (text[0] == '1' && text[1] == '/' && text[2] == '4') { + if (size == 3 || word_boundary(text[3]) || + (size >= 5 && tolower(text[3]) == 't' && tolower(text[4]) == 'h')) { + BUFPUTSL(ob, "¼"); + return 2; + } + } + + if (text[0] == '3' && text[1] == '/' && text[2] == '4') { + if (size == 3 || word_boundary(text[3]) || + (size >= 6 && tolower(text[3]) == 't' && tolower(text[4]) == 'h' && tolower(text[5]) == 's')) { + BUFPUTSL(ob, "¾"); + return 2; + } + } + } + + bufputc(ob, text[0]); + return 0; +} + +static size_t +smartypants_cb__dquote(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (!smartypants_quotes(ob, previous_char, size > 0 ? 
text[1] : 0, 'd', &smrt->in_dquote)) + BUFPUTSL(ob, """); + + return 0; +} + +static size_t +smartypants_cb__ltag(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + static const char *skip_tags[] = { + "pre", "code", "var", "samp", "kbd", "math", "script", "style" + }; + static const size_t skip_tags_count = 8; + + size_t tag, i = 0; + + while (i < size && text[i] != '>') + i++; + + for (tag = 0; tag < skip_tags_count; ++tag) { + if (sdhtml_is_tag(text, size, skip_tags[tag]) == HTML_TAG_OPEN) + break; + } + + if (tag < skip_tags_count) { + for (;;) { + while (i < size && text[i] != '<') + i++; + + if (i == size) + break; + + if (sdhtml_is_tag(text + i, size - i, skip_tags[tag]) == HTML_TAG_CLOSE) + break; + + i++; + } + + while (i < size && text[i] != '>') + i++; + } + + bufput(ob, text, i + 1); + return i; +} + +static size_t +smartypants_cb__escape(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (size < 2) + return 0; + + switch (text[1]) { + case '\\': + case '"': + case '\'': + case '.': + case '-': + case '`': + bufputc(ob, text[1]); + return 1; + + default: + bufputc(ob, '\\'); + return 0; + } +} + +#if 0 +static struct { + uint8_t c0; + const uint8_t *pattern; + const uint8_t *entity; + int skip; +} smartypants_subs[] = { + { '\'', "'s>", "’", 0 }, + { '\'', "'t>", "’", 0 }, + { '\'', "'re>", "’", 0 }, + { '\'', "'ll>", "’", 0 }, + { '\'', "'ve>", "’", 0 }, + { '\'', "'m>", "’", 0 }, + { '\'', "'d>", "’", 0 }, + { '-', "--", "—", 1 }, + { '-', "<->", "–", 0 }, + { '.', "...", "…", 2 }, + { '.', ". . 
.", "…", 4 }, + { '(', "(c)", "©", 2 }, + { '(', "(r)", "®", 2 }, + { '(', "(tm)", "™", 3 }, + { '3', "<3/4>", "¾", 2 }, + { '3', "<3/4ths>", "¾", 2 }, + { '1', "<1/2>", "½", 2 }, + { '1', "<1/4>", "¼", 2 }, + { '1', "<1/4th>", "¼", 2 }, + { '&', "�", 0, 3 }, +}; +#endif + +void +sdhtml_smartypants(struct buf *ob, const uint8_t *text, size_t size) +{ + size_t i; + struct smartypants_data smrt = {0, 0}; + + if (!text) + return; + + bufgrow(ob, size); + + for (i = 0; i < size; ++i) { + size_t org; + uint8_t action = 0; + + org = i; + while (i < size && (action = smartypants_cb_chars[text[i]]) == 0) + i++; + + if (i > org) + bufput(ob, text + org, i - org); + + if (i < size) { + i += smartypants_cb_ptrs[(int)action] + (ob, &smrt, i ? text[i - 1] : 0, text + i, size - i); + } + } +} + + diff --git a/SnudownTest/html_block_names.txt b/SnudownTest/html_block_names.txt new file mode 100644 index 0000000..a41d7d1 --- /dev/null +++ b/SnudownTest/html_block_names.txt @@ -0,0 +1,25 @@ +## +p +dl +h1 +h2 +h3 +h4 +h5 +h6 +ol +ul +del +div +ins +pre +form +math +table +figure +iframe +script +style +fieldset +noscript +blockquote diff --git a/SnudownTest/html_blocks.h b/SnudownTest/html_blocks.h new file mode 100644 index 0000000..09a758f --- /dev/null +++ b/SnudownTest/html_blocks.h @@ -0,0 +1,206 @@ +/* C code produced by gperf version 3.0.3 */ +/* Command-line: gperf -N find_block_tag -H hash_block_tag -C -c -E --ignore-case html_block_names.txt */ +/* Computed positions: -k'1-2' */ + +#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ + && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \ + && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \ + && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \ + && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \ + && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \ + && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \ + && ('=' == 61) && ('>' == 62) && ('?' 
== 63) && ('A' == 65) \ + && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \ + && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \ + && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \ + && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \ + && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \ + && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \ + && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \ + && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \ + && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \ + && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \ + && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \ + && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \ + && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \ + && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \ + && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)) +/* The character set is not based on ISO-646. */ +error "gperf generated tables don't work with this execution character set. Please report a bug to ." 
+#endif + +/* maximum key range = 37, duplicates = 0 */ + +#ifndef GPERF_DOWNCASE +#define GPERF_DOWNCASE 1 +static unsigned char gperf_downcase[256] = + { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, + 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255 + }; +#endif + +#ifndef GPERF_CASE_STRNCMP +#define GPERF_CASE_STRNCMP 1 +static int +gperf_case_strncmp (s1, s2, n) + register const char *s1; + register const char *s2; + register unsigned int n; +{ + for (; n > 0;) + { + unsigned char c1 = gperf_downcase[(unsigned char)*s1++]; + unsigned char c2 = gperf_downcase[(unsigned char)*s2++]; + if (c1 != 0 && c1 == c2) + { + n--; + continue; + } + return (int)c1 - (int)c2; + } + return 0; +} +#endif + +#ifdef __GNUC__ +__inline +#else +#ifdef __cplusplus +inline +#endif +#endif +static unsigned int +hash_block_tag (str, len) + register const char *str; + register 
unsigned int len; +{ + static const unsigned char asso_values[] = + { + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 8, 30, 25, 20, 15, 10, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 0, 38, 0, 38, + 5, 5, 5, 15, 0, 38, 38, 0, 15, 10, + 0, 38, 38, 15, 0, 5, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 0, 38, + 0, 38, 5, 5, 5, 15, 0, 38, 38, 0, + 15, 10, 0, 38, 38, 15, 0, 5, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38 + }; + register int hval = len; + + switch (hval) + { + default: + hval += asso_values[(unsigned char)str[1]+1]; + /*FALLTHROUGH*/ + case 1: + hval += asso_values[(unsigned char)str[0]]; + break; + } + return hval; +} + +#ifdef __GNUC__ +__inline +#ifdef __GNUC_STDC_INLINE__ +__attribute__ ((__gnu_inline__)) +#endif +#endif +const char * +find_block_tag (str, len) + register const char *str; + register unsigned int len; +{ + enum + { + TOTAL_KEYWORDS = 24, + MIN_WORD_LENGTH = 1, + MAX_WORD_LENGTH = 10, + MIN_HASH_VALUE = 1, + MAX_HASH_VALUE = 37 + }; + + static const char * const wordlist[] = + { + "", + "p", + "dl", + "div", + "math", + "table", + "", + "ul", + "del", + "form", + "blockquote", + "figure", + "ol", + "fieldset", + "", + "h1", + "", + "h6", + "pre", + "", "", + "script", + "h5", + "noscript", + "", + "style", + "iframe", + "h4", + "ins", + "", 
"", "", + "h3", + "", "", "", "", + "h2" + }; + + if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) + { + register int key = hash_block_tag (str, len); + + if (key <= MAX_HASH_VALUE && key >= 0) + { + register const char *s = wordlist[key]; + + if ((((unsigned char)*str ^ (unsigned char)*s) & ~32) == 0 && !gperf_case_strncmp (str, s, len) && s[len] == '\0') + return s; + } + } + return 0; +} diff --git a/SnudownTest/html_entities.gperf b/SnudownTest/html_entities.gperf new file mode 100644 index 0000000..f94e3c9 --- /dev/null +++ b/SnudownTest/html_entities.gperf @@ -0,0 +1,292 @@ +%language=ANSI-C +%define lookup-function-name is_allowed_named_entity +%compare-strncmp +%readonly-tables +%define hash-function-name hash_html_entity +%enum +%includes +%{ +#include + +/* Parsers tend to choke on entities with values greater than this */ +const u_int32_t MAX_NUM_ENTITY_VAL = 0x10ffff; +/* Any numeric entity longer than this is obviously above MAX_NUM_ENTITY_VAL + * used to avoid dealing with overflows. */ +const size_t MAX_NUM_ENTITY_LEN = 7; + +inline int is_valid_numeric_entity(uint32_t entity_val) +{ + /* Some XML parsers will choke on entities with certain + * values (mostly control characters.) 
+ * + * According to lxml these are all problematic: + * + * [xrange(0, 8), + * xrange(11, 12), + * xrange(14, 31), + * xrange(55296, 57343), + * xrange(65534, 65535)] + */ + return (entity_val > 8 + && (entity_val != 11 && entity_val != 12) + && (entity_val < 14 || entity_val > 31) + && (entity_val < 55296 || entity_val > 57343) + && (entity_val != 65534 && entity_val != 65535) + && entity_val <= MAX_NUM_ENTITY_VAL); +} + +%} +%% +Æ +Á + +À +Α +Å +à +Ä +Β +Ç +Χ +‡ +Δ +Ð +É +Ê +È +Ε +Η +Ë +Γ +Í +Î +Ì +Ι +Ï +Κ +Λ +Μ +Ñ +Ν +Œ +Ó +Ô +Ò +Ω +Ο +Ø +Õ +Ö +Φ +Π +″ +Ψ +Ρ +Š +Σ +Þ +Τ +Θ +Ú +Û +Ù +Υ +Ü +Ξ +Ý +Ÿ +Ζ +á +â +´ +æ +à +ℵ +α +& +∧ +∠ +' +å +≈ +ã +ä +„ +β +¦ +• +∩ +ç +¸ +¢ +χ +ˆ +♣ +≅ +© +↵ +∪ +¤ +⇓ +† +↓ +° +δ +♦ +÷ +é +ê +è +∅ +  +  +ε +≡ +η +ð +ë +€ +∃ +ƒ +∀ +½ +¼ +¾ +⁄ +γ +≥ +> +⇔ +↔ +♥ +… +í +î +¡ +ì +ℑ +∞ +∫ +ι +¿ +∈ +ï +κ +⇐ +λ +⟨ +« +← +⌈ +“ +≤ +⌊ +∗ +◊ +‎ +‹ +‘ +< +¯ +— +µ +· +− +μ +∇ +  +– +≠ +∋ +¬ +∉ +⊄ +ñ +ν +ó +ô +œ +ò +‾ +ω +ο +⊕ +∨ +ª +º +ø +õ +⊗ +ö +¶ +∂ +‰ +⊥ +φ +π +ϖ +± +£ +′ +∏ +∝ +ψ +" +⇒ +√ +⟩ +» +→ +⌉ +” +ℜ +® +⌋ +ρ +‏ +› +’ +‚ +š +⋅ +§ +­ +σ +ς +∼ +♠ +⊂ +⊆ +∑ +¹ +² +³ +⊃ +⊇ +ß +τ +∴ +θ +ϑ +  +þ +˜ +× +™ +⇑ +ú +↑ +û +ù +¨ +ϒ +υ +ü +℘ +ξ +ý +¥ +ÿ +ζ +‍ +‌ diff --git a/SnudownTest/html_smartypants.c b/SnudownTest/html_smartypants.c new file mode 100644 index 0000000..4db8f02 --- /dev/null +++ b/SnudownTest/html_smartypants.c @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2011, Vicent Marti + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "buffer.h" +#include "html.h" + +#include +#include +#include +#include + +#if defined(_WIN32) +#define snprintf _snprintf +#endif + +struct smartypants_data { + int in_squote; + int in_dquote; +}; + +static size_t smartypants_cb__ltag(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__dquote(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__amp(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__period(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__number(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__dash(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__parens(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__squote(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__backtick(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); +static size_t smartypants_cb__escape(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size); + +static size_t 
(*smartypants_cb_ptrs[]) + (struct buf *, struct smartypants_data *, uint8_t, const uint8_t *, size_t) = +{ + NULL, /* 0 */ + smartypants_cb__dash, /* 1 */ + smartypants_cb__parens, /* 2 */ + smartypants_cb__squote, /* 3 */ + smartypants_cb__dquote, /* 4 */ + smartypants_cb__amp, /* 5 */ + smartypants_cb__period, /* 6 */ + smartypants_cb__number, /* 7 */ + smartypants_cb__ltag, /* 8 */ + smartypants_cb__backtick, /* 9 */ + smartypants_cb__escape, /* 10 */ +}; + +static const uint8_t smartypants_cb_chars[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 4, 0, 0, 0, 5, 3, 2, 0, 0, 0, 0, 1, 6, 0, + 0, 7, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static inline int +word_boundary(uint8_t c) +{ + return c == 0 || isspace(c) || ispunct(c); +} + +static int +smartypants_quotes(struct buf *ob, uint8_t previous_char, uint8_t next_char, uint8_t quote, int *is_open) +{ + char ent[8]; + + if (*is_open && !word_boundary(next_char)) + return 0; + + if (!(*is_open) && !word_boundary(previous_char)) + return 0; + + snprintf(ent, sizeof(ent), "&%c%cquo;", (*is_open) ? 
'r' : 'l', quote); + *is_open = !(*is_open); + bufputs(ob, ent); + return 1; +} + +static size_t +smartypants_cb__squote(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (size >= 2) { + uint8_t t1 = tolower(text[1]); + + if (t1 == '\'') { + if (smartypants_quotes(ob, previous_char, size >= 3 ? text[2] : 0, 'd', &smrt->in_dquote)) + return 1; + } + + if ((t1 == 's' || t1 == 't' || t1 == 'm' || t1 == 'd') && + (size == 3 || word_boundary(text[2]))) { + BUFPUTSL(ob, "’"); + return 0; + } + + if (size >= 3) { + uint8_t t2 = tolower(text[2]); + + if (((t1 == 'r' && t2 == 'e') || + (t1 == 'l' && t2 == 'l') || + (t1 == 'v' && t2 == 'e')) && + (size == 4 || word_boundary(text[3]))) { + BUFPUTSL(ob, "’"); + return 0; + } + } + } + + if (smartypants_quotes(ob, previous_char, size > 0 ? text[1] : 0, 's', &smrt->in_squote)) + return 0; + + bufputc(ob, text[0]); + return 0; +} + +static size_t +smartypants_cb__parens(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (size >= 3) { + uint8_t t1 = tolower(text[1]); + uint8_t t2 = tolower(text[2]); + + if (t1 == 'c' && t2 == ')') { + BUFPUTSL(ob, "©"); + return 2; + } + + if (t1 == 'r' && t2 == ')') { + BUFPUTSL(ob, "®"); + return 2; + } + + if (size >= 4 && t1 == 't' && t2 == 'm' && text[3] == ')') { + BUFPUTSL(ob, "™"); + return 3; + } + } + + bufputc(ob, text[0]); + return 0; +} + +static size_t +smartypants_cb__dash(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (size >= 3 && text[1] == '-' && text[2] == '-') { + BUFPUTSL(ob, "—"); + return 2; + } + + if (size >= 2 && text[1] == '-') { + BUFPUTSL(ob, "–"); + return 1; + } + + bufputc(ob, text[0]); + return 0; +} + +static size_t +smartypants_cb__amp(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (size >= 6 && memcmp(text, """, 
6) == 0) { + if (smartypants_quotes(ob, previous_char, size >= 7 ? text[6] : 0, 'd', &smrt->in_dquote)) + return 5; + } + + if (size >= 4 && memcmp(text, "�", 4) == 0) + return 3; + + bufputc(ob, '&'); + return 0; +} + +static size_t +smartypants_cb__period(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (size >= 3 && text[1] == '.' && text[2] == '.') { + BUFPUTSL(ob, "…"); + return 2; + } + + if (size >= 5 && text[1] == ' ' && text[2] == '.' && text[3] == ' ' && text[4] == '.') { + BUFPUTSL(ob, "…"); + return 4; + } + + bufputc(ob, text[0]); + return 0; +} + +static size_t +smartypants_cb__backtick(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (size >= 2 && text[1] == '`') { + if (smartypants_quotes(ob, previous_char, size >= 3 ? text[2] : 0, 'd', &smrt->in_dquote)) + return 1; + } + + return 0; +} + +static size_t +smartypants_cb__number(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (word_boundary(previous_char) && size >= 3) { + if (text[0] == '1' && text[1] == '/' && text[2] == '2') { + if (size == 3 || word_boundary(text[3])) { + BUFPUTSL(ob, "½"); + return 2; + } + } + + if (text[0] == '1' && text[1] == '/' && text[2] == '4') { + if (size == 3 || word_boundary(text[3]) || + (size >= 5 && tolower(text[3]) == 't' && tolower(text[4]) == 'h')) { + BUFPUTSL(ob, "¼"); + return 2; + } + } + + if (text[0] == '3' && text[1] == '/' && text[2] == '4') { + if (size == 3 || word_boundary(text[3]) || + (size >= 6 && tolower(text[3]) == 't' && tolower(text[4]) == 'h' && tolower(text[5]) == 's')) { + BUFPUTSL(ob, "¾"); + return 2; + } + } + } + + bufputc(ob, text[0]); + return 0; +} + +static size_t +smartypants_cb__dquote(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (!smartypants_quotes(ob, previous_char, size > 0 ? 
text[1] : 0, 'd', &smrt->in_dquote)) + BUFPUTSL(ob, """); + + return 0; +} + +static size_t +smartypants_cb__ltag(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + static const char *skip_tags[] = { + "pre", "code", "var", "samp", "kbd", "math", "script", "style" + }; + static const size_t skip_tags_count = 8; + + size_t tag, i = 0; + + while (i < size && text[i] != '>') + i++; + + for (tag = 0; tag < skip_tags_count; ++tag) { + if (sdhtml_is_tag(text, size, skip_tags[tag]) == HTML_TAG_OPEN) + break; + } + + if (tag < skip_tags_count) { + for (;;) { + while (i < size && text[i] != '<') + i++; + + if (i == size) + break; + + if (sdhtml_is_tag(text + i, size - i, skip_tags[tag]) == HTML_TAG_CLOSE) + break; + + i++; + } + + while (i < size && text[i] != '>') + i++; + } + + bufput(ob, text, i + 1); + return i; +} + +static size_t +smartypants_cb__escape(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size) +{ + if (size < 2) + return 0; + + switch (text[1]) { + case '\\': + case '"': + case '\'': + case '.': + case '-': + case '`': + bufputc(ob, text[1]); + return 1; + + default: + bufputc(ob, '\\'); + return 0; + } +} + +#if 0 +static struct { + uint8_t c0; + const uint8_t *pattern; + const uint8_t *entity; + int skip; +} smartypants_subs[] = { + { '\'', "'s>", "’", 0 }, + { '\'', "'t>", "’", 0 }, + { '\'', "'re>", "’", 0 }, + { '\'', "'ll>", "’", 0 }, + { '\'', "'ve>", "’", 0 }, + { '\'', "'m>", "’", 0 }, + { '\'', "'d>", "’", 0 }, + { '-', "--", "—", 1 }, + { '-', "<->", "–", 0 }, + { '.', "...", "…", 2 }, + { '.', ". . 
.", "…", 4 }, + { '(', "(c)", "©", 2 }, + { '(', "(r)", "®", 2 }, + { '(', "(tm)", "™", 3 }, + { '3', "<3/4>", "¾", 2 }, + { '3', "<3/4ths>", "¾", 2 }, + { '1', "<1/2>", "½", 2 }, + { '1', "<1/4>", "¼", 2 }, + { '1', "<1/4th>", "¼", 2 }, + { '&', "�", 0, 3 }, +}; +#endif + +void +sdhtml_smartypants(struct buf *ob, const uint8_t *text, size_t size) +{ + size_t i; + struct smartypants_data smrt = {0, 0}; + + if (!text) + return; + + bufgrow(ob, size); + + for (i = 0; i < size; ++i) { + size_t org; + uint8_t action = 0; + + org = i; + while (i < size && (action = smartypants_cb_chars[text[i]]) == 0) + i++; + + if (i > org) + bufput(ob, text + org, i - org); + + if (i < size) { + i += smartypants_cb_ptrs[(int)action] + (ob, &smrt, i ? text[i - 1] : 0, text + i, size - i); + } + } +} + + diff --git a/SnudownTest/markdown.c b/SnudownTest/markdown.c new file mode 100644 index 0000000..abe4a1d --- /dev/null +++ b/SnudownTest/markdown.c @@ -0,0 +1,2661 @@ +/* markdown.c - generic markdown parser */ + +/* + * Copyright (c) 2009, Natacha Porté + * Copyright (c) 2011, Vicent Marti + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include "markdown.h" +#include "stack.h" + +#include +#include +#include +#include + +#if defined(_WIN32) +#define strncasecmp _strnicmp +#endif + +#define REF_TABLE_SIZE 8 + +#define BUFFER_BLOCK 0 +#define BUFFER_SPAN 1 + +#define MKD_LI_END 8 /* internal list flag */ + +#define gperf_case_strncmp(s1, s2, n) strncasecmp(s1, s2, n) +#define GPERF_DOWNCASE 1 +#define GPERF_CASE_STRNCMP 1 +#include "html_blocks.h" +#include "html_entities.h" + +/*************** + * LOCAL TYPES * + ***************/ + +/* link_ref: reference to a link */ +struct link_ref { + unsigned int id; + + struct buf *link; + struct buf *title; + + struct link_ref *next; +}; + +/* char_trigger: function pointer to render active chars */ +/* returns the number of chars taken care of */ +/* data is the pointer of the beginning of the span */ +/* offset is the number of valid chars before data */ +struct sd_markdown; +typedef size_t +(*char_trigger)(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); + +static size_t char_emphasis(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_linebreak(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_codespan(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_escape(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_entity(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_langle_tag(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_autolink_url(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, 
size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_autolink_email(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_autolink_www(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_autolink_subreddit_or_username(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_link(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_superscript(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); + +enum markdown_char_t { + MD_CHAR_NONE = 0, + MD_CHAR_EMPHASIS, + MD_CHAR_CODESPAN, + MD_CHAR_LINEBREAK, + MD_CHAR_LINK, + MD_CHAR_LANGLE, + MD_CHAR_ESCAPE, + MD_CHAR_ENTITITY, + MD_CHAR_AUTOLINK_URL, + MD_CHAR_AUTOLINK_EMAIL, + MD_CHAR_AUTOLINK_WWW, + MD_CHAR_AUTOLINK_SUBREDDIT_OR_USERNAME, + MD_CHAR_SUPERSCRIPT, +}; + +static char_trigger markdown_char_ptrs[] = { + NULL, + &char_emphasis, + &char_codespan, + &char_linebreak, + &char_link, + &char_langle_tag, + &char_escape, + &char_entity, + &char_autolink_url, + &char_autolink_email, + &char_autolink_www, + &char_autolink_subreddit_or_username, + &char_superscript, +}; + +/* render • structure containing one particular render */ +struct sd_markdown { + struct sd_callbacks cb; + void *opaque; + + struct link_ref *refs[REF_TABLE_SIZE]; + uint8_t active_char[256]; + struct stack work_bufs[2]; + unsigned int ext_flags; + size_t max_nesting; + size_t max_table_cols; + int in_link_body; +}; + +/*************************** + * HELPER FUNCTIONS * + ***************************/ + +static inline struct buf * +rndr_newbuf(struct sd_markdown *rndr, int type) +{ + static const size_t buf_size[2] = {256, 64}; + struct buf *work = NULL; + 
struct stack *pool = &rndr->work_bufs[type]; + + if (pool->size < pool->asize && + pool->item[pool->size] != NULL) { + work = pool->item[pool->size++]; + work->size = 0; + } else { + work = bufnew(buf_size[type]); + stack_push(pool, work); + } + + return work; +} + +static inline void +rndr_popbuf(struct sd_markdown *rndr, int type) +{ + rndr->work_bufs[type].size--; +} + +static void +unscape_text(struct buf *ob, struct buf *src) +{ + size_t i = 0, org; + while (i < src->size) { + org = i; + while (i < src->size && src->data[i] != '\\') + i++; + + if (i > org) + bufput(ob, src->data + org, i - org); + + if (i + 1 >= src->size) + break; + + bufputc(ob, src->data[i + 1]); + i += 2; + } +} + +static unsigned int +hash_link_ref(const uint8_t *link_ref, size_t length) +{ + size_t i; + unsigned int hash = 0; + + for (i = 0; i < length; ++i) + hash = tolower(link_ref[i]) + (hash << 6) + (hash << 16) - hash; + + return hash; +} + +static struct link_ref * +add_link_ref( + struct link_ref **references, + const uint8_t *name, size_t name_size) +{ + struct link_ref *ref = calloc(1, sizeof(struct link_ref)); + + if (!ref) + return NULL; + + ref->id = hash_link_ref(name, name_size); + ref->next = references[ref->id % REF_TABLE_SIZE]; + + references[ref->id % REF_TABLE_SIZE] = ref; + return ref; +} + +static struct link_ref * +find_link_ref(struct link_ref **references, uint8_t *name, size_t length) +{ + unsigned int hash = hash_link_ref(name, length); + struct link_ref *ref = NULL; + + ref = references[hash % REF_TABLE_SIZE]; + + while (ref != NULL) { + if (ref->id == hash) + return ref; + + ref = ref->next; + } + + return NULL; +} + +static void +free_link_refs(struct link_ref **references) +{ + size_t i; + + for (i = 0; i < REF_TABLE_SIZE; ++i) { + struct link_ref *r = references[i]; + struct link_ref *next; + + while (r) { + next = r->next; + bufrelease(r->link); + bufrelease(r->title); + free(r); + r = next; + } + } +} + +/* + * Check whether a char is a Markdown space. 
+ + * Right now we only consider spaces the actual + * space and a newline: tabs and carriage returns + * are filtered out during the preprocessing phase. + * + * If we wanted to actually be UTF-8 compliant, we + * should instead extract an Unicode codepoint from + * this character and check for space properties. + */ +static inline int +_isspace(int c) +{ + return c == ' ' || c == '\n'; +} + +/**************************** + * INLINE PARSING FUNCTIONS * + ****************************/ + +/* is_mail_autolink • looks for the address part of a mail autolink and '>' */ +/* this is less strict than the original markdown e-mail address matching */ +static size_t +is_mail_autolink(uint8_t *data, size_t size) +{ + size_t i = 0, nb = 0; + + /* address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@' */ + for (i = 0; i < size; ++i) { + if (isalnum(data[i])) + continue; + + switch (data[i]) { + case '@': + nb++; + + case '-': + case '.': + case '_': + break; + + case '>': + return (nb == 1) ? i + 1 : 0; + + default: + return 0; + } + } + + return 0; +} + +/* tag_length • returns the length of the given tag, or 0 is it's not valid */ +static size_t +tag_length(uint8_t *data, size_t size, enum mkd_autolink *autolink) +{ + size_t i, j; + + /* a valid tag can't be shorter than 3 chars */ + if (size < 3) return 0; + + /* begins with a '<' optionally followed by '/', followed by letter or number */ + if (data[0] != '<') return 0; + i = (data[1] == '/') ? 2 : 1; + + if (!isalnum(data[i])) + return 0; + + /* scheme test */ + *autolink = MKDA_NOT_AUTOLINK; + + /* try to find the beginning of an URI */ + while (i < size && (isalnum(data[i]) || data[i] == '.' 
|| data[i] == '+' || data[i] == '-')) + i++; + + if (i > 1 && data[i] == '@') { + if ((j = is_mail_autolink(data + i, size - i)) != 0) { + *autolink = MKDA_EMAIL; + return i + j; + } + } + + if (i > 2 && data[i] == ':') { + *autolink = MKDA_NORMAL; + i++; + } + + /* completing autolink test: no whitespace or ' or " */ + if (i >= size) + *autolink = MKDA_NOT_AUTOLINK; + + else if (*autolink) { + j = i; + + while (i < size) { + if (data[i] == '\\') i += 2; + else if (data[i] == '>' || data[i] == '\'' || + data[i] == '"' || data[i] == ' ' || data[i] == '\n') + break; + else i++; + } + + if (i >= size) return 0; + if (i > j && data[i] == '>') return i + 1; + /* one of the forbidden chars has been found */ + *autolink = MKDA_NOT_AUTOLINK; + } + + /* looking for sometinhg looking like a tag end */ + while (i < size && data[i] != '>') i++; + if (i >= size) return 0; + return i + 1; +} + +/* parse_inline • parses inline markdown elements */ +static void +parse_inline(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size) +{ + size_t i = 0, end = 0, last_special = 0; + uint8_t action = 0; + struct buf work = { 0, 0, 0, 0 }; + + if (rndr->work_bufs[BUFFER_SPAN].size + + rndr->work_bufs[BUFFER_BLOCK].size > rndr->max_nesting) + return; + + while (i < size) { + /* copying inactive chars into the output */ + while (end < size && (action = rndr->active_char[data[end]]) == 0) { + end++; + } + + if (rndr->cb.normal_text) { + work.data = data + i; + work.size = end - i; + rndr->cb.normal_text(ob, &work, rndr->opaque); + } + else + bufput(ob, data + i, end - i); + + if (end >= size) break; + i = end; + + end = markdown_char_ptrs[(int)action](ob, rndr, data + i, i - last_special, i, size - i); + if (!end) /* no action from the callback */ + end = i + 1; + else { + i += end; + last_special = end = i; + } + } +} + +/* find_emph_char • looks for the next emph uint8_t, skipping other constructs */ +static size_t +find_emph_char(uint8_t *data, size_t size, uint8_t c) +{ + 
size_t i = 1; + + while (i < size) { + while (i < size && data[i] != c && data[i] != '`' && data[i] != '[') + i++; + + if (i == size) + return 0; + + if (data[i] == c) + return i; + + /* not counting escaped chars */ + if (i && data[i - 1] == '\\') { + i++; continue; + } + + if (data[i] == '`') { + size_t span_nb = 0, bt; + size_t tmp_i = 0; + + /* counting the number of opening backticks */ + while (i < size && data[i] == '`') { + i++; span_nb++; + } + + if (i >= size) return 0; + + /* finding the matching closing sequence */ + bt = 0; + while (i < size && bt < span_nb) { + if (!tmp_i && data[i] == c) tmp_i = i; + if (data[i] == '`') bt++; + else bt = 0; + i++; + } + + if (i >= size) return tmp_i; + } + /* skipping a link */ + else if (data[i] == '[') { + size_t tmp_i = 0; + uint8_t cc; + + i++; + while (i < size && data[i] != ']') { + if (!tmp_i && data[i] == c) tmp_i = i; + i++; + } + + i++; + while (i < size && (data[i] == ' ' || data[i] == '\n')) + i++; + + if (i >= size) + return tmp_i; + + switch (data[i]) { + case '[': + cc = ']'; break; + + case '(': + cc = ')'; break; + + default: + if (tmp_i) + return tmp_i; + else + continue; + } + + i++; + while (i < size && data[i] != cc) { + if (!tmp_i && data[i] == c) tmp_i = i; + i++; + } + + if (i >= size) + return tmp_i; + + i++; + } + } + + return 0; +} + +/* parse_emph1 • parsing single emphase */ +/* closed by a symbol not preceded by whitespace and not followed by symbol */ +static size_t +parse_emph1(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size, uint8_t c) +{ + size_t i = 0, len; + struct buf *work = 0; + int r; + + if (!rndr->cb.emphasis) return 0; + + /* skipping one symbol if coming from emph3 */ + if (size > 1 && data[0] == c && data[1] == c) i = 1; + + while (i < size) { + len = find_emph_char(data + i, size - i, c); + if (!len) return 0; + i += len; + if (i >= size) return 0; + + if (data[i] == c && !_isspace(data[i - 1])) { + if ((rndr->ext_flags & MKDEXT_NO_INTRA_EMPHASIS) && 
(c == '_')) { + if (!(i + 1 == size || _isspace(data[i + 1]) || ispunct(data[i + 1]))) + continue; + } + + work = rndr_newbuf(rndr, BUFFER_SPAN); + parse_inline(work, rndr, data, i); + r = rndr->cb.emphasis(ob, work, rndr->opaque); + rndr_popbuf(rndr, BUFFER_SPAN); + return r ? i + 1 : 0; + } + } + + return 0; +} + +/* parse_emph2 • parsing single emphase */ +static size_t +parse_emph2(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size, uint8_t c) +{ + int (*render_method)(struct buf *ob, const struct buf *text, void *opaque); + size_t i = 0, len; + struct buf *work = 0; + int r; + + render_method = (c == '~') ? rndr->cb.strikethrough : rndr->cb.double_emphasis; + + if (!render_method) + return 0; + + while (i < size) { + len = find_emph_char(data + i, size - i, c); + if (!len) return 0; + i += len; + + if (i + 1 < size && data[i] == c && data[i + 1] == c && i && !_isspace(data[i - 1])) { + work = rndr_newbuf(rndr, BUFFER_SPAN); + parse_inline(work, rndr, data, i); + r = render_method(ob, work, rndr->opaque); + rndr_popbuf(rndr, BUFFER_SPAN); + return r ? i + 2 : 0; + } + i++; + } + return 0; +} + +/* parse_emph3 • parsing single emphase */ +/* finds the first closing tag, and delegates to the other emph */ +static size_t +parse_emph3(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size, uint8_t c) +{ + size_t i = 0, len; + int r; + + while (i < size) { + len = find_emph_char(data + i, size - i, c); + if (!len) return 0; + i += len; + + /* skip whitespace preceded symbols */ + if (data[i] != c || _isspace(data[i - 1])) + continue; + + if (i + 2 < size && data[i + 1] == c && data[i + 2] == c && rndr->cb.triple_emphasis) { + /* triple symbol found */ + struct buf *work = rndr_newbuf(rndr, BUFFER_SPAN); + + parse_inline(work, rndr, data, i); + r = rndr->cb.triple_emphasis(ob, work, rndr->opaque); + rndr_popbuf(rndr, BUFFER_SPAN); + return r ? 
i + 3 : 0; + + } else if (i + 1 < size && data[i + 1] == c) { + /* double symbol found, handing over to emph1 */ + len = parse_emph1(ob, rndr, data - 2, size + 2, c); + if (!len) return 0; + else return len - 2; + + } else { + /* single symbol found, handing over to emph2 */ + len = parse_emph2(ob, rndr, data - 1, size + 1, c); + if (!len) return 0; + else return len - 1; + } + } + return 0; +} + +/* char_emphasis • single and double emphasis parsing */ +static size_t +char_emphasis(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + uint8_t c = data[0]; + size_t ret; + + if (size > 2 && data[1] != c) { + /* whitespace cannot follow an opening emphasis; + * strikethrough only takes two characters '~~' */ + if (c == '~' || _isspace(data[1]) || (ret = parse_emph1(ob, rndr, data + 1, size - 1, c)) == 0) + return 0; + + return ret + 1; + } + + if (size > 3 && data[1] == c && data[2] != c) { + if (_isspace(data[2]) || (ret = parse_emph2(ob, rndr, data + 2, size - 2, c)) == 0) + return 0; + + return ret + 2; + } + + if (size > 4 && data[1] == c && data[2] == c && data[3] != c) { + if (c == '~' || _isspace(data[3]) || (ret = parse_emph3(ob, rndr, data + 3, size - 3, c)) == 0) + return 0; + + return ret + 3; + } + + return 0; +} + + +/* char_linebreak • '\n' preceded by two spaces (assuming linebreak != 0) */ +static size_t +char_linebreak(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + if (max_rewind < 2 || data[-1] != ' ' || data[-2] != ' ') + return 0; + + /* removing the last space from ob and rendering */ + while (ob->size && ob->data[ob->size - 1] == ' ') + ob->size--; + + return rndr->cb.linebreak(ob, rndr->opaque) ? 
1 : 0; +} + + +/* char_codespan • '`' parsing a code span (assuming codespan != 0) */ +static size_t +char_codespan(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + size_t end, nb = 0, i, f_begin, f_end; + + /* counting the number of backticks in the delimiter */ + while (nb < size && data[nb] == '`') + nb++; + + /* finding the next delimiter */ + i = 0; + for (end = nb; end < size && i < nb; end++) { + if (data[end] == '`') i++; + else i = 0; + } + + if (i < nb && end >= size) + return 0; /* no matching delimiter */ + + /* trimming outside whitespaces */ + f_begin = nb; + while (f_begin < end && data[f_begin] == ' ') + f_begin++; + + f_end = end - nb; + while (f_end > nb && data[f_end-1] == ' ') + f_end--; + + /* real code span */ + if (f_begin < f_end) { + struct buf work = { data + f_begin, f_end - f_begin, 0, 0 }; + if (!rndr->cb.codespan(ob, &work, rndr->opaque)) + end = 0; + } else { + if (!rndr->cb.codespan(ob, 0, rndr->opaque)) + end = 0; + } + + return end; +} + + +/* char_escape • '\\' backslash escape */ +static size_t +char_escape(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + static const char *escape_chars = "\\`*_{}[]()#+-.!:|&<>/^~"; + struct buf work = { 0, 0, 0, 0 }; + + if (size > 1) { + if (strchr(escape_chars, data[1]) == NULL) + return 0; + + if (rndr->cb.normal_text) { + work.data = data + 1; + work.size = 1; + rndr->cb.normal_text(ob, &work, rndr->opaque); + } + else bufputc(ob, data[1]); + } else if (size == 1) { + bufputc(ob, data[0]); + } + + return 2; +} + +/* char_entity • '&' escaped when it doesn't belong to an entity */ +static size_t +char_entity(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + size_t end = 1; + size_t content_start; + size_t content_end; + struct buf work = { 0, 0, 0, 0 }; + int numeric = 0; + int hex = 0; + int 
entity_base; + uint32_t entity_val; + + if (end < size && data[end] == '#') { + numeric = 1; + end++; + } + + if (end < size && numeric && tolower(data[end]) == 'x') { + hex = 1; + end++; + } + + content_start = end; + + while (end < size) { + const char c = data[end]; + if (hex) { + if (!isxdigit(c)) break; + } else if (numeric) { + if (!isdigit(c)) break; + } else if (!isalnum(c)) { + break; + } + end++; + } + + content_end = end; + + if (end > content_start && end < size && data[end] == ';') + end++; /* well-formed entity */ + else + return 0; /* not an entity */ + + /* way too long to be a valid numeric entity */ + if (numeric && content_end - content_start > MAX_NUM_ENTITY_LEN) + return 0; + + /* Validate the entity's contents */ + if (numeric) { + if (hex) + entity_base = 16; + else + entity_base = 10; + + // This is ok because it'll stop once it hits the ';' + entity_val = strtol((char*)data + content_start, NULL, entity_base); + if (!is_valid_numeric_entity(entity_val)) + return 0; + } else { + if (!is_allowed_named_entity((const char *)data, end)) + return 0; + } + + if (rndr->cb.entity) { + work.data = data; + work.size = end; + rndr->cb.entity(ob, &work, rndr->opaque); + } else { + /* Necessary so we can normalize `>` to `>` */ + bufputc(ob, '&'); + if (numeric) + bufputc(ob, '#'); + if (hex) + bufputc(ob, 'x'); + bufput(ob, data + content_start, end - content_start); + } + + return end; +} + +/* char_langle_tag • '<' when tags or autolinks are allowed */ +static size_t +char_langle_tag(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + enum mkd_autolink altype = MKDA_NOT_AUTOLINK; + size_t end = tag_length(data, size, &altype); + struct buf work = { data, end, 0, 0 }; + int ret = 0; + + if (end > 2) { + if (rndr->cb.autolink && altype != MKDA_NOT_AUTOLINK) { + struct buf *u_link = rndr_newbuf(rndr, BUFFER_SPAN); + work.data = data + 1; + work.size = end - 2; + unscape_text(u_link, &work); 
+ ret = rndr->cb.autolink(ob, u_link, altype, rndr->opaque); + rndr_popbuf(rndr, BUFFER_SPAN); + } + else if (rndr->cb.raw_html_tag) + ret = rndr->cb.raw_html_tag(ob, &work, rndr->opaque); + } + + if (!ret) return 0; + else return end; +} + +static size_t +char_autolink_www(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + struct buf *link, *link_url, *link_text; + size_t link_len, rewind; + + if (!rndr->cb.link || rndr->in_link_body) + return 0; + + link = rndr_newbuf(rndr, BUFFER_SPAN); + + if ((link_len = sd_autolink__www(&rewind, link, data, max_rewind, size, 0)) > 0) { + link_url = rndr_newbuf(rndr, BUFFER_SPAN); + BUFPUTSL(link_url, "http://"); + bufput(link_url, link->data, link->size); + + buftruncate(ob, ob->size - rewind); + if (rndr->cb.normal_text) { + link_text = rndr_newbuf(rndr, BUFFER_SPAN); + rndr->cb.normal_text(link_text, link, rndr->opaque); + rndr->cb.link(ob, link_url, NULL, link_text, rndr->opaque); + rndr_popbuf(rndr, BUFFER_SPAN); + } else { + rndr->cb.link(ob, link_url, NULL, link, rndr->opaque); + } + rndr_popbuf(rndr, BUFFER_SPAN); + } + + rndr_popbuf(rndr, BUFFER_SPAN); + return link_len; +} + +static size_t +char_autolink_subreddit_or_username(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + struct buf *link, *link_text, *link_url; + size_t link_len, rewind; + int no_slash; + + if (!rndr->cb.autolink || rndr->in_link_body) + return 0; + + link = rndr_newbuf(rndr, BUFFER_SPAN); + + link_len = sd_autolink__subreddit(&rewind, link, data, max_rewind, max_lookbehind, size, &no_slash); + if (link_len == 0) + link_len = sd_autolink__username(&rewind, link, data, max_rewind, max_lookbehind, size, &no_slash); + + /* Found either a user or subreddit link */ + if (link_len > 0) { + link_url = rndr_newbuf(rndr, BUFFER_SPAN); + if (no_slash) + bufputc(link_url, '/'); + bufput(link_url, link->data, link->size); + + 
buftruncate(ob, ob->size - rewind); + if (rndr->cb.normal_text) { + link_text = rndr_newbuf(rndr, BUFFER_SPAN); + rndr->cb.normal_text(link_text, link, rndr->opaque); + rndr->cb.link(ob, link_url, NULL, link_text, rndr->opaque); + rndr_popbuf(rndr, BUFFER_SPAN); + } else { + rndr->cb.link(ob, link_url, NULL, link, rndr->opaque); + } + rndr_popbuf(rndr, BUFFER_SPAN); + } + rndr_popbuf(rndr, BUFFER_SPAN); + + return link_len; +} + +static size_t +char_autolink_email(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + struct buf *link; + size_t link_len, rewind; + + if (!rndr->cb.autolink || rndr->in_link_body) + return 0; + + link = rndr_newbuf(rndr, BUFFER_SPAN); + + if ((link_len = sd_autolink__email(&rewind, link, data, max_rewind, size, 0)) > 0) { + buftruncate(ob, ob->size - rewind); + rndr->cb.autolink(ob, link, MKDA_EMAIL, rndr->opaque); + } + + rndr_popbuf(rndr, BUFFER_SPAN); + return link_len; +} + +static size_t +char_autolink_url(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + struct buf *link; + size_t link_len, rewind; + + if (!rndr->cb.autolink || rndr->in_link_body) + return 0; + + link = rndr_newbuf(rndr, BUFFER_SPAN); + + if ((link_len = sd_autolink__url(&rewind, link, data, max_rewind, size, 0)) > 0) { + buftruncate(ob, ob->size - rewind); + rndr->cb.autolink(ob, link, MKDA_NORMAL, rndr->opaque); + } + + rndr_popbuf(rndr, BUFFER_SPAN); + return link_len; +} + +/* char_link • '[': parsing a link or an image */ +static size_t +char_link(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + int is_img = (max_rewind && data[-1] == '!'), level; + size_t i = 1, txt_e, link_b = 0, link_e = 0, title_b = 0, title_e = 0; + struct buf *content = 0; + struct buf *link = 0; + struct buf *title = 0; + struct buf *u_link = 0; + size_t org_work_size = 
rndr->work_bufs[BUFFER_SPAN].size; + int text_has_nl = 0, ret = 0; + int in_title = 0, qtype = 0; + + /* checking whether the correct renderer exists */ + if ((is_img && !rndr->cb.image) || (!is_img && !rndr->cb.link)) + goto cleanup; + + /* looking for the matching closing bracket */ + for (level = 1; i < size; i++) { + if (data[i] == '\n') + text_has_nl = 1; + + else if (data[i - 1] == '\\') + continue; + + else if (data[i] == '[') + level++; + + else if (data[i] == ']') { + level--; + if (level <= 0) + break; + } + } + + if (i >= size) + goto cleanup; + + txt_e = i; + i++; + + /* skip any amount of whitespace or newline */ + /* (this is much more laxist than original markdown syntax) */ + while (i < size && _isspace(data[i])) + i++; + + /* inline style link */ + if (i < size && data[i] == '(') { + /* skipping initial whitespace */ + i++; + + while (i < size && _isspace(data[i])) + i++; + + link_b = i; + + /* looking for link end: ' " ) */ + while (i < size) { + if (data[i] == '\\') i += 2; + else if (data[i] == ')') break; + else if (i >= 1 && _isspace(data[i-1]) && (data[i] == '\'' || data[i] == '"')) break; + else i++; + } + + if (i >= size) goto cleanup; + link_e = i; + + /* looking for title end if present */ + if (data[i] == '\'' || data[i] == '"') { + qtype = data[i]; + in_title = 1; + i++; + title_b = i; + + while (i < size) { + if (data[i] == '\\') i += 2; + else if (data[i] == qtype) {in_title = 0; i++;} + else if ((data[i] == ')') && !in_title) break; + else i++; + } + + if (i >= size) goto cleanup; + + /* skipping whitespaces after title */ + title_e = i - 1; + while (title_e > title_b && _isspace(data[title_e])) + title_e--; + + /* checking for closing quote presence */ + if (data[title_e] != '\'' && data[title_e] != '"') { + title_b = title_e = 0; + link_e = i; + } + } + + /* remove whitespace at the end of the link */ + while (link_e > link_b && _isspace(data[link_e - 1])) + link_e--; + + /* remove optional angle brackets around the link */ + if 
(data[link_b] == '<') link_b++; + if (data[link_e - 1] == '>') link_e--; + + /* building escaped link and title */ + if (link_e > link_b) { + link = rndr_newbuf(rndr, BUFFER_SPAN); + bufput(link, data + link_b, link_e - link_b); + } + + if (title_e > title_b) { + title = rndr_newbuf(rndr, BUFFER_SPAN); + bufput(title, data + title_b, title_e - title_b); + } + + i++; + } + + /* reference style link */ + else if (i < size && data[i] == '[') { + struct buf id = { 0, 0, 0, 0 }; + struct link_ref *lr; + + /* looking for the id */ + i++; + link_b = i; + while (i < size && data[i] != ']') i++; + if (i >= size) goto cleanup; + link_e = i; + + /* finding the link_ref */ + if (link_b == link_e) { + if (text_has_nl) { + struct buf *b = rndr_newbuf(rndr, BUFFER_SPAN); + size_t j; + + for (j = 1; j < txt_e; j++) { + if (data[j] != '\n') + bufputc(b, data[j]); + else if (data[j - 1] != ' ') + bufputc(b, ' '); + } + + id.data = b->data; + id.size = b->size; + } else { + id.data = data + 1; + id.size = txt_e - 1; + } + } else { + id.data = data + link_b; + id.size = link_e - link_b; + } + + lr = find_link_ref(rndr->refs, id.data, id.size); + if (!lr) + goto cleanup; + + /* keeping link and title from link_ref */ + link = lr->link; + title = lr->title; + i++; + } + + /* shortcut reference style link */ + else { + struct buf id = { 0, 0, 0, 0 }; + struct link_ref *lr; + + /* crafting the id */ + if (text_has_nl) { + struct buf *b = rndr_newbuf(rndr, BUFFER_SPAN); + size_t j; + + for (j = 1; j < txt_e; j++) { + if (data[j] != '\n') + bufputc(b, data[j]); + else if (data[j - 1] != ' ') + bufputc(b, ' '); + } + + id.data = b->data; + id.size = b->size; + } else { + id.data = data + 1; + id.size = txt_e - 1; + } + + /* finding the link_ref */ + lr = find_link_ref(rndr->refs, id.data, id.size); + if (!lr) + goto cleanup; + + /* keeping link and title from link_ref */ + link = lr->link; + title = lr->title; + + /* rewinding the whitespace */ + i = txt_e + 1; + } + + /* building content: 
img alt is escaped, link content is parsed */ + if (txt_e > 1) { + content = rndr_newbuf(rndr, BUFFER_SPAN); + if (is_img) { + bufput(content, data + 1, txt_e - 1); + } else { + /* disable autolinking when parsing inline the + * content of a link */ + rndr->in_link_body = 1; + parse_inline(content, rndr, data + 1, txt_e - 1); + rndr->in_link_body = 0; + } + } + + if (link) { + u_link = rndr_newbuf(rndr, BUFFER_SPAN); + unscape_text(u_link, link); + } else { + goto cleanup; + } + + /* calling the relevant rendering function */ + if (is_img) { + if (ob->size && ob->data[ob->size - 1] == '!') + ob->size -= 1; + + ret = rndr->cb.image(ob, u_link, title, content, rndr->opaque); + } else { + ret = rndr->cb.link(ob, u_link, title, content, rndr->opaque); + } + + /* cleanup */ +cleanup: + rndr->work_bufs[BUFFER_SPAN].size = (int)org_work_size; + return ret ? i : 0; +} + +static size_t +char_superscript(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + size_t sup_start, sup_len; + struct buf *sup; + + if (!rndr->cb.superscript) + return 0; + + if (size < 2) + return 0; + + if (data[1] == '(') { + sup_start = sup_len = 2; + + while (sup_len < size && data[sup_len] != ')' && data[sup_len - 1] != '\\') + sup_len++; + + if (sup_len == size) + return 0; + } else { + sup_start = sup_len = 1; + + while (sup_len < size && !_isspace(data[sup_len])) + sup_len++; + } + + if (sup_len - sup_start == 0) + return (sup_start == 2) ? 3 : 0; + + sup = rndr_newbuf(rndr, BUFFER_SPAN); + parse_inline(sup, rndr, data + sup_start, sup_len - sup_start); + rndr->cb.superscript(ob, sup, rndr->opaque); + rndr_popbuf(rndr, BUFFER_SPAN); + + return (sup_start == 2) ? 
sup_len + 1 : sup_len; +} + +/********************************* + * BLOCK-LEVEL PARSING FUNCTIONS * + *********************************/ + +/* is_empty • returns the line length when it is empty, 0 otherwise */ +static size_t +is_empty(uint8_t *data, size_t size) +{ + size_t i; + + for (i = 0; i < size && data[i] != '\n'; i++) + if (data[i] != ' ') + return 0; + + return i + 1; +} + +/* is_hrule • returns whether a line is a horizontal rule */ +static int +is_hrule(uint8_t *data, size_t size) +{ + size_t i = 0, n = 0; + uint8_t c; + + /* skipping initial spaces */ + if (size < 3) return 0; + if (data[0] == ' ') { i++; + if (data[1] == ' ') { i++; + if (data[2] == ' ') { i++; } } } + + /* looking at the hrule uint8_t */ + if (i + 2 >= size + || (data[i] != '*' && data[i] != '-' && data[i] != '_')) + return 0; + c = data[i]; + + /* the whole line must be the char or whitespace */ + while (i < size && data[i] != '\n') { + if (data[i] == c) n++; + else if (data[i] != ' ') + return 0; + + i++; + } + + return n >= 3; +} + +/* check if a line begins with a code fence; return the + * width of the code fence */ +static size_t +prefix_codefence(uint8_t *data, size_t size) +{ + size_t i = 0, n = 0; + uint8_t c; + + /* skipping initial spaces */ + if (size < 3) return 0; + if (data[0] == ' ') { i++; + if (data[1] == ' ') { i++; + if (data[2] == ' ') { i++; } } } + + /* looking at the hrule uint8_t */ + if (i + 2 >= size || !(data[i] == '~' || data[i] == '`')) + return 0; + + c = data[i]; + + /* the whole line must be the uint8_t or whitespace */ + while (i < size && data[i] == c) { + n++; i++; + } + + if (n < 3) + return 0; + + return i; +} + +/* check if a line is a code fence; return its size if it is */ +static size_t +is_codefence(uint8_t *data, size_t size, struct buf *syntax) +{ + size_t i = 0, syn_len = 0; + uint8_t *syn_start; + + i = prefix_codefence(data, size); + if (i == 0) + return 0; + + while (i < size && data[i] == ' ') + i++; + + syn_start = data + i; + + if 
(i < size && data[i] == '{') { + i++; syn_start++; + + while (i < size && data[i] != '}' && data[i] != '\n') { + syn_len++; i++; + } + + if (i == size || data[i] != '}') + return 0; + + /* strip all whitespace at the beginning and the end + * of the {} block */ + while (syn_len > 0 && _isspace(syn_start[0])) { + syn_start++; syn_len--; + } + + while (syn_len > 0 && _isspace(syn_start[syn_len - 1])) + syn_len--; + + i++; + } else { + while (i < size && !_isspace(data[i])) { + syn_len++; i++; + } + } + + if (syntax) { + syntax->data = syn_start; + syntax->size = syn_len; + } + + while (i < size && data[i] != '\n') { + if (!_isspace(data[i])) + return 0; + + i++; + } + + return i + 1; +} + +/* is_atxheader • returns whether the line is a hash-prefixed header */ +static int +is_atxheader(struct sd_markdown *rndr, uint8_t *data, size_t size) +{ + if (data[0] != '#') + return 0; + + if (rndr->ext_flags & MKDEXT_SPACE_HEADERS) { + size_t level = 0; + + while (level < size && level < 6 && data[level] == '#') + level++; + + if (level < size && data[level] != ' ') + return 0; + } + + return 1; +} + +/* is_headerline • returns whether the line is a setext-style hdr underline */ +static int +is_headerline(uint8_t *data, size_t size) +{ + size_t i = 0; + + /* test of level 1 header */ + if (data[i] == '=') { + for (i = 1; i < size && data[i] == '='; i++); + while (i < size && data[i] == ' ') i++; + return (i >= size || data[i] == '\n') ? 1 : 0; } + + /* test of level 2 header */ + if (data[i] == '-') { + for (i = 1; i < size && data[i] == '-'; i++); + while (i < size && data[i] == ' ') i++; + return (i >= size || data[i] == '\n') ? 
2 : 0; } + + return 0; +} + +static int +is_next_headerline(uint8_t *data, size_t size) +{ + size_t i = 0; + + while (i < size && data[i] != '\n') + i++; + + if (++i >= size) + return 0; + + return is_headerline(data + i, size - i); +} + +/* prefix_quote • returns blockquote prefix length */ +static size_t +prefix_quote(uint8_t *data, size_t size) +{ + size_t i = 0; + if (i < size && data[i] == ' ') i++; + if (i < size && data[i] == ' ') i++; + if (i < size && data[i] == ' ') i++; + + if (i < size && data[i] == '>') { + if (i + 1 < size && data[i + 1] == ' ') + return i + 2; + + return i + 1; + } + + return 0; +} + +/* prefix_code • returns prefix length for block code*/ +static size_t +prefix_code(uint8_t *data, size_t size) +{ + if (size > 3 && data[0] == ' ' && data[1] == ' ' + && data[2] == ' ' && data[3] == ' ') return 4; + + return 0; +} + +/* prefix_oli • returns ordered list item prefix */ +static size_t +prefix_oli(uint8_t *data, size_t size) +{ + size_t i = 0; + + if (i < size && data[i] == ' ') i++; + if (i < size && data[i] == ' ') i++; + if (i < size && data[i] == ' ') i++; + + if (i >= size || data[i] < '0' || data[i] > '9') + return 0; + + while (i < size && data[i] >= '0' && data[i] <= '9') + i++; + + if (i + 1 >= size || data[i] != '.' 
|| data[i + 1] != ' ') + return 0; + + if (is_next_headerline(data + i, size - i)) + return 0; + + return i + 2; +} + +/* prefix_uli • returns ordered list item prefix */ +static size_t +prefix_uli(uint8_t *data, size_t size) +{ + size_t i = 0; + + if (i < size && data[i] == ' ') i++; + if (i < size && data[i] == ' ') i++; + if (i < size && data[i] == ' ') i++; + + if (i + 1 >= size || + (data[i] != '*' && data[i] != '+' && data[i] != '-') || + data[i + 1] != ' ') + return 0; + + if (is_next_headerline(data + i, size - i)) + return 0; + + return i + 2; +} + + +/* parse_block • parsing of one block, returning next uint8_t to parse */ +static void parse_block(struct buf *ob, struct sd_markdown *rndr, + uint8_t *data, size_t size); + + +/* parse_blockquote • handles parsing of a blockquote fragment */ +static size_t +parse_blockquote(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size) +{ + size_t beg, end = 0, pre, work_size = 0; + uint8_t *work_data = 0; + struct buf *out = 0; + + out = rndr_newbuf(rndr, BUFFER_BLOCK); + beg = 0; + while (beg < size) { + for (end = beg + 1; end < size && data[end - 1] != '\n'; end++); + + pre = prefix_quote(data + beg, end - beg); + + if (pre) + beg += pre; /* skipping prefix */ + + /* empty line followed by non-quote line */ + else if (is_empty(data + beg, end - beg) && + (end >= size || (prefix_quote(data + end, size - end) == 0 && + !is_empty(data + end, size - end)))) + break; + + if (beg < end) { /* copy into the in-place working buffer */ + /* bufput(work, data + beg, end - beg); */ + if (!work_data) + work_data = data + beg; + else if (data + beg != work_data + work_size) + memmove(work_data + work_size, data + beg, end - beg); + work_size += end - beg; + } + beg = end; + } + + parse_block(out, rndr, work_data, work_size); + if (rndr->cb.blockquote) + rndr->cb.blockquote(ob, out, rndr->opaque); + rndr_popbuf(rndr, BUFFER_BLOCK); + return end; +} + +static size_t +parse_htmlblock(struct buf *ob, struct 
sd_markdown *rndr, uint8_t *data, size_t size, int do_render); + +/* parse_blockquote • handles parsing of a regular paragraph */ +static size_t +parse_paragraph(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size) +{ + size_t i = 0, end = 0; + int level = 0; + struct buf work = { data, 0, 0, 0 }; + + while (i < size) { + for (end = i + 1; end < size && data[end - 1] != '\n'; end++) /* empty */; + + if (prefix_quote(data + i, end - i) != 0) { + end = i; + break; + } + + if (is_empty(data + i, size - i)) + break; + + if ((level = is_headerline(data + i, size - i)) != 0) + break; + + if (is_atxheader(rndr, data + i, size - i) || + is_hrule(data + i, size - i) || + prefix_quote(data + i, size - i)) { + end = i; + break; + } + + /* + * Early termination of a paragraph with the same logic + * as Markdown 1.0.0. If this logic is applied, the + * Markdown 1.0.3 test suite won't pass cleanly + * + * :: If the first character in a new line is not a letter, + * let's check to see if there's some kind of block starting + * here + */ + if ((rndr->ext_flags & MKDEXT_LAX_SPACING) && !isalnum(data[i])) { + if (prefix_oli(data + i, size - i) || + prefix_uli(data + i, size - i)) { + end = i; + break; + } + + /* see if an html block starts here */ + if (data[i] == '<' && rndr->cb.blockhtml && + parse_htmlblock(ob, rndr, data + i, size - i, 0)) { + end = i; + break; + } + + /* see if a code fence starts here */ + if ((rndr->ext_flags & MKDEXT_FENCED_CODE) != 0 && + is_codefence(data + i, size - i, NULL) != 0) { + end = i; + break; + } + } + + i = end; + } + + work.size = i; + while (work.size && data[work.size - 1] == '\n') + work.size--; + + if (!level) { + struct buf *tmp = rndr_newbuf(rndr, BUFFER_BLOCK); + parse_inline(tmp, rndr, work.data, work.size); + if (rndr->cb.paragraph) + rndr->cb.paragraph(ob, tmp, rndr->opaque); + rndr_popbuf(rndr, BUFFER_BLOCK); + } else { + struct buf *header_work; + + if (work.size) { + size_t beg; + i = work.size; + work.size -= 1; 
+ + while (work.size && data[work.size] != '\n') + work.size -= 1; + + beg = work.size + 1; + while (work.size && data[work.size - 1] == '\n') + work.size -= 1; + + if (work.size > 0) { + struct buf *tmp = rndr_newbuf(rndr, BUFFER_BLOCK); + parse_inline(tmp, rndr, work.data, work.size); + + if (rndr->cb.paragraph) + rndr->cb.paragraph(ob, tmp, rndr->opaque); + + rndr_popbuf(rndr, BUFFER_BLOCK); + work.data += beg; + work.size = i - beg; + } + else work.size = i; + } + + header_work = rndr_newbuf(rndr, BUFFER_SPAN); + parse_inline(header_work, rndr, work.data, work.size); + + if (rndr->cb.header) + rndr->cb.header(ob, header_work, (int)level, rndr->opaque); + + rndr_popbuf(rndr, BUFFER_SPAN); + } + + return end; +} + +/* parse_fencedcode • handles parsing of a block-level code fragment */ +static size_t +parse_fencedcode(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size) +{ + size_t beg, end; + struct buf *work = 0; + struct buf lang = { 0, 0, 0, 0 }; + + beg = is_codefence(data, size, &lang); + if (beg == 0) return 0; + + work = rndr_newbuf(rndr, BUFFER_BLOCK); + + while (beg < size) { + size_t fence_end; + struct buf fence_trail = { 0, 0, 0, 0 }; + + fence_end = is_codefence(data + beg, size - beg, &fence_trail); + if (fence_end != 0 && fence_trail.size == 0) { + beg += fence_end; + break; + } + + for (end = beg + 1; end < size && data[end - 1] != '\n'; end++); + + if (beg < end) { + /* verbatim copy to the working buffer, + escaping entities */ + if (is_empty(data + beg, end - beg)) + bufputc(work, '\n'); + else bufput(work, data + beg, end - beg); + } + beg = end; + } + + if (work->size && work->data[work->size - 1] != '\n') + bufputc(work, '\n'); + + if (rndr->cb.blockcode) + rndr->cb.blockcode(ob, work, lang.size ? 
&lang : NULL, rndr->opaque); + + rndr_popbuf(rndr, BUFFER_BLOCK); + return beg; +} + +static size_t +parse_blockcode(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size) +{ + size_t beg, end, pre; + struct buf *work = 0; + + work = rndr_newbuf(rndr, BUFFER_BLOCK); + + beg = 0; + while (beg < size) { + for (end = beg + 1; end < size && data[end - 1] != '\n'; end++) {}; + pre = prefix_code(data + beg, end - beg); + + if (pre) + beg += pre; /* skipping prefix */ + else if (!is_empty(data + beg, end - beg)) + /* non-empty non-prefixed line breaks the pre */ + break; + + if (beg < end) { + /* verbatim copy to the working buffer, + escaping entities */ + if (is_empty(data + beg, end - beg)) + bufputc(work, '\n'); + else bufput(work, data + beg, end - beg); + } + beg = end; + } + + while (work->size && work->data[work->size - 1] == '\n') + work->size -= 1; + + bufputc(work, '\n'); + + if (rndr->cb.blockcode) + rndr->cb.blockcode(ob, work, NULL, rndr->opaque); + + rndr_popbuf(rndr, BUFFER_BLOCK); + return beg; +} + +/* parse_listitem • parsing of a single list item */ +/* assuming initial prefix is already removed */ +static size_t +parse_listitem(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size, int *flags) +{ + struct buf *work = 0, *inter = 0; + size_t beg = 0, end, pre, sublist = 0, orgpre = 0, i; + int in_empty = 0, has_inside_empty = 0, in_fence = 0; + + /* keeping track of the first indentation prefix */ + while (orgpre < 3 && orgpre < size && data[orgpre] == ' ') + orgpre++; + + beg = prefix_uli(data, size); + if (!beg) + beg = prefix_oli(data, size); + + if (!beg) + return 0; + + /* skipping to the beginning of the following line */ + end = beg; + while (end < size && data[end - 1] != '\n') + end++; + + /* getting working buffers */ + work = rndr_newbuf(rndr, BUFFER_SPAN); + inter = rndr_newbuf(rndr, BUFFER_SPAN); + + /* putting the first line into the working buffer */ + bufput(work, data + beg, end - beg); + beg = end; + + /* 
process the following lines */ + while (beg < size) { + size_t has_next_uli = 0, has_next_oli = 0; + + end++; + + while (end < size && data[end - 1] != '\n') + end++; + + /* process an empty line */ + if (is_empty(data + beg, end - beg)) { + in_empty = 1; + beg = end; + continue; + } + + /* calculating the indentation */ + i = 0; + while (i < 4 && beg + i < end && data[beg + i] == ' ') + i++; + + pre = i; + + if (rndr->ext_flags & MKDEXT_FENCED_CODE) { + if (is_codefence(data + beg + i, end - beg - i, NULL) != 0) + in_fence = !in_fence; + } + + /* Only check for new list items if we are **not** inside + * a fenced code block */ + if (!in_fence) { + has_next_uli = prefix_uli(data + beg + i, end - beg - i); + has_next_oli = prefix_oli(data + beg + i, end - beg - i); + } + + /* checking for ul/ol switch */ + if (in_empty && ( + ((*flags & MKD_LIST_ORDERED) && has_next_uli) || + (!(*flags & MKD_LIST_ORDERED) && has_next_oli))){ + *flags |= MKD_LI_END; + break; /* the following item must have same list type */ + } + + /* checking for a new item */ + if ((has_next_uli && !is_hrule(data + beg + i, end - beg - i)) || has_next_oli) { + if (in_empty) + has_inside_empty = 1; + + if (pre == orgpre) /* the following item must have */ + break; /* the same indentation */ + + if (!sublist) + sublist = work->size; + } + /* joining only indented stuff after empty lines; + * note that now we only require 1 space of indentation + * to continue a list */ + else if (in_empty && pre == 0) { + *flags |= MKD_LI_END; + break; + } + else if (in_empty) { + bufputc(work, '\n'); + has_inside_empty = 1; + } + + in_empty = 0; + + /* adding the line without prefix into the working buffer */ + bufput(work, data + beg + i, end - beg - i); + beg = end; + } + + /* render of li contents */ + if (has_inside_empty) + *flags |= MKD_LI_BLOCK; + + if (*flags & MKD_LI_BLOCK) { + /* intermediate render of block li */ + if (sublist && sublist < work->size) { + parse_block(inter, rndr, work->data, sublist); + 
parse_block(inter, rndr, work->data + sublist, work->size - sublist); + } + else + parse_block(inter, rndr, work->data, work->size); + } else { + /* intermediate render of inline li */ + if (sublist && sublist < work->size) { + parse_inline(inter, rndr, work->data, sublist); + parse_block(inter, rndr, work->data + sublist, work->size - sublist); + } + else + parse_inline(inter, rndr, work->data, work->size); + } + + /* render of li itself */ + if (rndr->cb.listitem) + rndr->cb.listitem(ob, inter, *flags, rndr->opaque); + + rndr_popbuf(rndr, BUFFER_SPAN); + rndr_popbuf(rndr, BUFFER_SPAN); + return beg; +} + + +/* parse_list • parsing ordered or unordered list block */ +static size_t +parse_list(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size, int flags) +{ + struct buf *work = 0; + size_t i = 0, j; + + work = rndr_newbuf(rndr, BUFFER_BLOCK); + + while (i < size) { + j = parse_listitem(work, rndr, data + i, size - i, &flags); + i += j; + + if (!j || (flags & MKD_LI_END)) + break; + } + + if (rndr->cb.list) + rndr->cb.list(ob, work, flags, rndr->opaque); + rndr_popbuf(rndr, BUFFER_BLOCK); + return i; +} + +/* parse_atxheader • parsing of atx-style headers */ +static size_t +parse_atxheader(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size) +{ + size_t level = 0; + size_t i, end, skip; + + while (level < size && level < 6 && data[level] == '#') + level++; + + for (i = level; i < size && data[i] == ' '; i++); + + for (end = i; end < size && data[end] != '\n'; end++); + skip = end; + + while (end && data[end - 1] == '#') + end--; + + while (end && data[end - 1] == ' ') + end--; + + if (end > i) { + struct buf *work = rndr_newbuf(rndr, BUFFER_SPAN); + + parse_inline(work, rndr, data + i, end - i); + + if (rndr->cb.header) + rndr->cb.header(ob, work, (int)level, rndr->opaque); + + rndr_popbuf(rndr, BUFFER_SPAN); + } + + return skip; +} + + +/* htmlblock_end • checking end of HTML block : [ \t]*\n[ \t*]\n */ +/* returns the length on 
match, 0 otherwise */ +static size_t +htmlblock_end_tag( + const char *tag, + size_t tag_len, + struct sd_markdown *rndr, + uint8_t *data, + size_t size) +{ + size_t i, w; + + /* checking if tag is a match */ + if (tag_len + 3 >= size || + strncasecmp((char *)data + 2, tag, tag_len) != 0 || + data[tag_len + 2] != '>') + return 0; + + /* checking white lines */ + i = tag_len + 3; + w = 0; + if (i < size && (w = is_empty(data + i, size - i)) == 0) + return 0; /* non-blank after tag */ + i += w; + w = 0; + + if (i < size) + w = is_empty(data + i, size - i); + + return i + w; +} + +static size_t +htmlblock_end(const char *curtag, + struct sd_markdown *rndr, + uint8_t *data, + size_t size, + int start_of_line) +{ + size_t tag_size = strlen(curtag); + size_t i = 1, end_tag; + int block_lines = 0; + + while (i < size) { + i++; + while (i < size && !(data[i - 1] == '<' && data[i] == '/')) { + if (data[i] == '\n') + block_lines++; + + i++; + } + + /* If we are only looking for unindented tags, skip the tag + * if it doesn't follow a newline. 
+ * + * The only exception to this is if the tag is still on the + * initial line; in that case it still counts as a closing + * tag + */ + if (start_of_line && block_lines > 0 && data[i - 2] != '\n') + continue; + + if (i + 2 + tag_size >= size) + break; + + end_tag = htmlblock_end_tag(curtag, tag_size, rndr, data + i - 1, size - i + 1); + if (end_tag) + return i + end_tag - 1; + } + + return 0; +} + + +/* parse_htmlblock • parsing of inline HTML block */ +static size_t +parse_htmlblock(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size, int do_render) +{ + size_t i, j = 0, tag_end; + const char *curtag = NULL; + struct buf work = { data, 0, 0, 0 }; + + /* identification of the opening tag */ + if (size < 2 || data[0] != '<') + return 0; + + i = 1; + while (i < size && data[i] != '>' && data[i] != ' ') + i++; + + if (i < size) + curtag = find_block_tag((char *)data + 1, (int)i - 1); + + /* handling of special cases */ + if (!curtag) { + + /* HTML comment, laxist form */ + if (size > 5 && data[1] == '!' 
&& data[2] == '-' && data[3] == '-') { + i = 5; + + while (i < size && !(data[i - 2] == '-' && data[i - 1] == '-' && data[i] == '>')) + i++; + + i++; + + if (i < size) + j = is_empty(data + i, size - i); + + if (j) { + work.size = i + j; + if (do_render && rndr->cb.blockhtml) + rndr->cb.blockhtml(ob, &work, rndr->opaque); + return work.size; + } + } + + /* HR, which is the only self-closing block tag considered */ + if (size > 4 && (data[1] == 'h' || data[1] == 'H') && (data[2] == 'r' || data[2] == 'R')) { + i = 3; + while (i < size && data[i] != '>') + i++; + + if (i + 1 < size) { + i++; + j = is_empty(data + i, size - i); + if (j) { + work.size = i + j; + if (do_render && rndr->cb.blockhtml) + rndr->cb.blockhtml(ob, &work, rndr->opaque); + return work.size; + } + } + } + + /* no special case recognised */ + return 0; + } + + /* looking for an unindented matching closing tag */ + /* followed by a blank line */ + tag_end = htmlblock_end(curtag, rndr, data, size, 1); + + /* if not found, trying a second pass looking for indented match */ + /* but not if tag is "ins" or "del" (following original Markdown.pl) */ + if (!tag_end && strcmp(curtag, "ins") != 0 && strcmp(curtag, "del") != 0) { + tag_end = htmlblock_end(curtag, rndr, data, size, 0); + } + + if (!tag_end) + return 0; + + /* the end of the block has been found */ + work.size = tag_end; + if (do_render && rndr->cb.blockhtml) + rndr->cb.blockhtml(ob, &work, rndr->opaque); + + return tag_end; +} + +static void +parse_table_row( + struct buf *ob, + struct sd_markdown *rndr, + uint8_t *data, + size_t size, + size_t columns, + int *col_data, + int header_flag) +{ + size_t i = 0, col, cols_left; + struct buf *row_work = 0; + + if (!rndr->cb.table_cell || !rndr->cb.table_row) + return; + + row_work = rndr_newbuf(rndr, BUFFER_SPAN); + + if (i < size && data[i] == '|') + i++; + + for (col = 0; col < columns && i < size; ++col) { + size_t cell_start, cell_end; + struct buf *cell_work; + + cell_work = rndr_newbuf(rndr, 
BUFFER_SPAN); + + while (i < size && _isspace(data[i])) + i++; + + cell_start = i; + + while (i < size && data[i] != '|') + i++; + + cell_end = i - 1; + + while (cell_end > cell_start && _isspace(data[cell_end])) + cell_end--; + + parse_inline(cell_work, rndr, data + cell_start, 1 + cell_end - cell_start); + rndr->cb.table_cell(row_work, cell_work, col_data[col] | header_flag, rndr->opaque, 0); + + rndr_popbuf(rndr, BUFFER_SPAN); + i++; + } + + cols_left = columns - col; + if (cols_left > 0) { + struct buf empty_cell = { 0, 0, 0, 0 }; + rndr->cb.table_cell(row_work, &empty_cell, col_data[col] | header_flag, rndr->opaque, cols_left); + } + + rndr->cb.table_row(ob, row_work, rndr->opaque); + + rndr_popbuf(rndr, BUFFER_SPAN); +} + +static size_t +parse_table_header( + struct buf *ob, + struct sd_markdown *rndr, + uint8_t *data, + size_t size, + size_t *columns, + int **column_data) +{ + int pipes; + size_t i = 0, col, header_end, under_end; + + pipes = 0; + while (i < size && data[i] != '\n') + if (data[i++] == '|') + pipes++; + + if (i == size || pipes == 0) + return 0; + + header_end = i; + + while (header_end > 0 && _isspace(data[header_end - 1])) + header_end--; + + if (data[0] == '|') + pipes--; + + if (header_end && data[header_end - 1] == '|') + pipes--; + + if (pipes + 1 > rndr->max_table_cols) + return 0; + + *columns = pipes + 1; + *column_data = calloc(*columns, sizeof(int)); + + /* Parse the header underline */ + i++; + if (i < size && data[i] == '|') + i++; + + under_end = i; + while (under_end < size && data[under_end] != '\n') + under_end++; + + for (col = 0; col < *columns && i < under_end; ++col) { + size_t dashes = 0; + + while (i < under_end && data[i] == ' ') + i++; + + if (data[i] == ':') { + i++; (*column_data)[col] |= MKD_TABLE_ALIGN_L; + dashes++; + } + + while (i < under_end && data[i] == '-') { + i++; dashes++; + } + + if (i < under_end && data[i] == ':') { + i++; (*column_data)[col] |= MKD_TABLE_ALIGN_R; + dashes++; + } + + while (i < 
under_end && data[i] == ' ') + i++; + + if (i < under_end && data[i] != '|') + break; + + if (dashes < 1) + break; + + i++; + } + + if (col < *columns) + return 0; + + parse_table_row( + ob, rndr, data, + header_end, + *columns, + *column_data, + MKD_TABLE_HEADER + ); + + return under_end + 1; +} + +static size_t +parse_table( + struct buf *ob, + struct sd_markdown *rndr, + uint8_t *data, + size_t size) +{ + size_t i; + + struct buf *header_work = 0; + struct buf *body_work = 0; + + size_t columns; + int *col_data = NULL; + + header_work = rndr_newbuf(rndr, BUFFER_SPAN); + body_work = rndr_newbuf(rndr, BUFFER_BLOCK); + + i = parse_table_header(header_work, rndr, data, size, &columns, &col_data); + if (i > 0) { + + while (i < size) { + size_t row_start; + int pipes = 0; + + row_start = i; + + while (i < size && data[i] != '\n') + if (data[i++] == '|') + pipes++; + + if (pipes == 0 || i == size) { + i = row_start; + break; + } + + parse_table_row( + body_work, + rndr, + data + row_start, + i - row_start, + columns, + col_data, 0 + ); + + i++; + } + + if (rndr->cb.table) + rndr->cb.table(ob, header_work, body_work, rndr->opaque); + } + + free(col_data); + rndr_popbuf(rndr, BUFFER_SPAN); + rndr_popbuf(rndr, BUFFER_BLOCK); + return i; +} + +/* parse_block • parsing of one block, returning next uint8_t to parse */ +static void +parse_block(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size) +{ + size_t beg, end, i; + uint8_t *txt_data; + beg = 0; + + if (rndr->work_bufs[BUFFER_SPAN].size + + rndr->work_bufs[BUFFER_BLOCK].size > rndr->max_nesting) + return; + + while (beg < size) { + txt_data = data + beg; + end = size - beg; + + if (is_atxheader(rndr, txt_data, end)) + beg += parse_atxheader(ob, rndr, txt_data, end); + + else if (data[beg] == '<' && rndr->cb.blockhtml && + (i = parse_htmlblock(ob, rndr, txt_data, end, 1)) != 0) + beg += i; + + else if ((i = is_empty(txt_data, end)) != 0) + beg += i; + + else if (is_hrule(txt_data, end)) { + if 
(rndr->cb.hrule) + rndr->cb.hrule(ob, rndr->opaque); + + while (beg < size && data[beg] != '\n') + beg++; + + beg++; + } + + else if ((rndr->ext_flags & MKDEXT_FENCED_CODE) != 0 && + (i = parse_fencedcode(ob, rndr, txt_data, end)) != 0) + beg += i; + + else if ((rndr->ext_flags & MKDEXT_TABLES) != 0 && + (i = parse_table(ob, rndr, txt_data, end)) != 0) + beg += i; + + else if (prefix_quote(txt_data, end)) + beg += parse_blockquote(ob, rndr, txt_data, end); + + else if (prefix_code(txt_data, end)) + beg += parse_blockcode(ob, rndr, txt_data, end); + + else if (prefix_uli(txt_data, end)) + beg += parse_list(ob, rndr, txt_data, end, 0); + + else if (prefix_oli(txt_data, end)) + beg += parse_list(ob, rndr, txt_data, end, MKD_LIST_ORDERED); + + else + beg += parse_paragraph(ob, rndr, txt_data, end); + } +} + + + +/********************* + * REFERENCE PARSING * + *********************/ + +/* is_ref • returns whether a line is a reference or not */ +static int +is_ref(const uint8_t *data, size_t beg, size_t end, size_t *last, struct link_ref **refs) +{ +/* int n; */ + size_t i = 0; + size_t id_offset, id_end; + size_t link_offset, link_end; + size_t title_offset, title_end; + size_t line_end; + + /* up to 3 optional leading spaces */ + if (beg + 3 >= end) return 0; + if (data[beg] == ' ') { i = 1; + if (data[beg + 1] == ' ') { i = 2; + if (data[beg + 2] == ' ') { i = 3; + if (data[beg + 3] == ' ') return 0; } } } + i += beg; + + /* id part: anything but a newline between brackets */ + if (data[i] != '[') return 0; + i++; + id_offset = i; + while (i < end && data[i] != '\n' && data[i] != '\r' && data[i] != ']') + i++; + if (i >= end || data[i] != ']') return 0; + id_end = i; + + /* spacer: colon (space | tab)* newline? 
(space | tab)* */ + i++; + if (i >= end || data[i] != ':') return 0; + i++; + while (i < end && data[i] == ' ') i++; + if (i < end && (data[i] == '\n' || data[i] == '\r')) { + i++; + if (i < end && data[i] == '\r' && data[i - 1] == '\n') i++; } + while (i < end && data[i] == ' ') i++; + if (i >= end) return 0; + + /* link: whitespace-free sequence, optionally between angle brackets */ + if (data[i] == '<') + i++; + + link_offset = i; + + while (i < end && data[i] != ' ' && data[i] != '\n' && data[i] != '\r') + i++; + + if (data[i - 1] == '>') link_end = i - 1; + else link_end = i; + + /* optional spacer: (space | tab)* (newline | '\'' | '"' | '(' ) */ + while (i < end && data[i] == ' ') i++; + if (i < end && data[i] != '\n' && data[i] != '\r' + && data[i] != '\'' && data[i] != '"' && data[i] != '(') + return 0; + line_end = 0; + /* computing end-of-line */ + if (i >= end || data[i] == '\r' || data[i] == '\n') line_end = i; + if (i + 1 < end && data[i] == '\n' && data[i + 1] == '\r') + line_end = i + 1; + + /* optional (space|tab)* spacer after a newline */ + if (line_end) { + i = line_end + 1; + while (i < end && data[i] == ' ') i++; } + + /* optional title: any non-newline sequence enclosed in '"() + alone on its line */ + title_offset = title_end = 0; + if (i + 1 < end + && (data[i] == '\'' || data[i] == '"' || data[i] == '(')) { + i++; + title_offset = i; + /* looking for EOL */ + while (i < end && data[i] != '\n' && data[i] != '\r') i++; + if (i + 1 < end && data[i] == '\n' && data[i + 1] == '\r') + title_end = i + 1; + else title_end = i; + /* stepping back */ + i -= 1; + while (i > title_offset && data[i] == ' ') + i -= 1; + if (i > title_offset + && (data[i] == '\'' || data[i] == '"' || data[i] == ')')) { + line_end = title_end; + title_end = i; } } + + if (!line_end || link_end == link_offset) + return 0; /* garbage after the link empty link */ + + /* a valid ref has been found, filling-in return structures */ + if (last) + *last = line_end; + + if (refs) { 
+ struct link_ref *ref; + + ref = add_link_ref(refs, data + id_offset, id_end - id_offset); + if (!ref) + return 0; + + ref->link = bufnew(link_end - link_offset); + bufput(ref->link, data + link_offset, link_end - link_offset); + + if (title_end > title_offset) { + ref->title = bufnew(title_end - title_offset); + bufput(ref->title, data + title_offset, title_end - title_offset); + } + } + + return 1; +} + +static void expand_tabs(struct buf *ob, const uint8_t *line, size_t size) +{ + size_t i = 0, tab = 0; + + while (i < size) { + size_t org = i; + + while (i < size && line[i] != '\t') { + i++; tab++; + } + + if (i > org) + bufput(ob, line + org, i - org); + + if (i >= size) + break; + + do { + bufputc(ob, ' '); tab++; + } while (tab % 4); + + i++; + } +} + +/********************** + * EXPORTED FUNCTIONS * + **********************/ + +struct sd_markdown * +sd_markdown_new( + unsigned int extensions, + size_t max_nesting, + size_t max_table_cols, + const struct sd_callbacks *callbacks, + void *opaque) +{ + struct sd_markdown *md = NULL; + + assert(max_nesting > 0 && max_table_cols > 0 && callbacks); + + md = malloc(sizeof(struct sd_markdown)); + if (!md) + return NULL; + + memcpy(&md->cb, callbacks, sizeof(struct sd_callbacks)); + + stack_init(&md->work_bufs[BUFFER_BLOCK], 4); + stack_init(&md->work_bufs[BUFFER_SPAN], 8); + + memset(md->active_char, 0x0, 256); + + if (md->cb.emphasis || md->cb.double_emphasis || md->cb.triple_emphasis) { + md->active_char['*'] = MD_CHAR_EMPHASIS; + md->active_char['_'] = MD_CHAR_EMPHASIS; + if (extensions & MKDEXT_STRIKETHROUGH) + md->active_char['~'] = MD_CHAR_EMPHASIS; + } + + if (md->cb.codespan) + md->active_char['`'] = MD_CHAR_CODESPAN; + + if (md->cb.linebreak) + md->active_char['\n'] = MD_CHAR_LINEBREAK; + + if (md->cb.image || md->cb.link) + md->active_char['['] = MD_CHAR_LINK; + + md->active_char['<'] = MD_CHAR_LANGLE; + md->active_char['\\'] = MD_CHAR_ESCAPE; + md->active_char['&'] = MD_CHAR_ENTITITY; + + if (extensions & 
MKDEXT_AUTOLINK) { + if (!(extensions & MKDEXT_NO_EMAIL_AUTOLINK)) + md->active_char['@'] = MD_CHAR_AUTOLINK_EMAIL; + md->active_char[':'] = MD_CHAR_AUTOLINK_URL; + md->active_char['w'] = MD_CHAR_AUTOLINK_WWW; + md->active_char['/'] = MD_CHAR_AUTOLINK_SUBREDDIT_OR_USERNAME; + } + + if (extensions & MKDEXT_SUPERSCRIPT) + md->active_char['^'] = MD_CHAR_SUPERSCRIPT; + + /* Extension data */ + md->ext_flags = extensions; + md->opaque = opaque; + md->max_nesting = max_nesting; + md->max_table_cols = max_table_cols; + md->in_link_body = 0; + + return md; +} + +void +sd_markdown_render(struct buf *ob, const uint8_t *document, size_t doc_size, struct sd_markdown *md) +{ +#define MARKDOWN_GROW(x) ((x) + ((x) >> 1)) + static const char UTF8_BOM[] = {0xEF, 0xBB, 0xBF}; + + struct buf *text; + size_t beg, end; + + text = bufnew(64); + if (!text) + return; + + /* Preallocate enough space for our buffer to avoid expanding while copying */ + bufgrow(text, doc_size); + + /* reset the references table */ + memset(&md->refs, 0x0, REF_TABLE_SIZE * sizeof(void *)); + + /* first pass: looking for references, copying everything else */ + beg = 0; + + /* Skip a possible UTF-8 BOM, even though the Unicode standard + * discourages having these in UTF-8 documents */ + if (doc_size >= 3 && memcmp(document, UTF8_BOM, 3) == 0) + beg += 3; + + while (beg < doc_size) /* iterating over lines */ + if (is_ref(document, beg, doc_size, &end, md->refs)) + beg = end; + else { /* skipping to the next line */ + end = beg; + while (end < doc_size && document[end] != '\n' && document[end] != '\r') + end++; + + /* adding the line body if present */ + if (end > beg) + expand_tabs(text, document + beg, end - beg); + + while (end < doc_size && (document[end] == '\n' || document[end] == '\r')) { + /* add one \n per newline */ + if (document[end] == '\n' || (end + 1 < doc_size && document[end + 1] != '\n')) + bufputc(text, '\n'); + end++; + } + + beg = end; + } + + /* pre-grow the output buffer to minimize 
allocations */ + bufgrow(ob, MARKDOWN_GROW(text->size)); + + /* second pass: actual rendering */ + if (md->cb.doc_header) + md->cb.doc_header(ob, md->opaque); + + if (text->size) { + /* adding a final newline if not already present */ + if (text->data[text->size - 1] != '\n' && text->data[text->size - 1] != '\r') + bufputc(text, '\n'); + + parse_block(ob, md, text->data, text->size); + } + + if (md->cb.doc_footer) + md->cb.doc_footer(ob, md->opaque); + + /* clean-up */ + bufrelease(text); + free_link_refs(md->refs); + + assert(md->work_bufs[BUFFER_SPAN].size == 0); + assert(md->work_bufs[BUFFER_BLOCK].size == 0); +} + +void +sd_markdown_free(struct sd_markdown *md) +{ + size_t i; + + for (i = 0; i < (size_t)md->work_bufs[BUFFER_SPAN].asize; ++i) + bufrelease(md->work_bufs[BUFFER_SPAN].item[i]); + + for (i = 0; i < (size_t)md->work_bufs[BUFFER_BLOCK].asize; ++i) + bufrelease(md->work_bufs[BUFFER_BLOCK].item[i]); + + stack_free(&md->work_bufs[BUFFER_SPAN]); + stack_free(&md->work_bufs[BUFFER_BLOCK]); + + free(md); +} + +void +sd_version(int *ver_major, int *ver_minor, int *ver_revision) +{ + *ver_major = SUNDOWN_VER_MAJOR; + *ver_minor = SUNDOWN_VER_MINOR; + *ver_revision = SUNDOWN_VER_REVISION; +} + +/* vim: set filetype=c: */ diff --git a/SnudownTest/markdown.h b/SnudownTest/markdown.h new file mode 100644 index 0000000..00d50dc --- /dev/null +++ b/SnudownTest/markdown.h @@ -0,0 +1,140 @@ +/* markdown.h - generic markdown parser */ + +/* + * Copyright (c) 2009, Natacha Porté + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef UPSKIRT_MARKDOWN_H +#define UPSKIRT_MARKDOWN_H + +#include "buffer.h" +#include "autolink.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define SUNDOWN_VERSION "1.16.0" +#define SUNDOWN_VER_MAJOR 1 +#define SUNDOWN_VER_MINOR 16 +#define SUNDOWN_VER_REVISION 0 + +/******************** + * TYPE DEFINITIONS * + ********************/ + +/* mkd_autolink - type of autolink */ +enum mkd_autolink { + MKDA_NOT_AUTOLINK, /* used internally when it is not an autolink*/ + MKDA_NORMAL, /* normal http/http/ftp/mailto/etc link */ + MKDA_EMAIL, /* e-mail link without explit mailto: */ +}; + +enum mkd_tableflags { + MKD_TABLE_ALIGN_L = 1, + MKD_TABLE_ALIGN_R = 2, + MKD_TABLE_ALIGN_CENTER = 3, + MKD_TABLE_ALIGNMASK = 3, + MKD_TABLE_HEADER = 4 +}; + +enum mkd_extensions { + MKDEXT_NO_INTRA_EMPHASIS = (1 << 0), + MKDEXT_TABLES = (1 << 1), + MKDEXT_FENCED_CODE = (1 << 2), + MKDEXT_AUTOLINK = (1 << 3), + MKDEXT_STRIKETHROUGH = (1 << 4), + MKDEXT_SPACE_HEADERS = (1 << 6), + MKDEXT_SUPERSCRIPT = (1 << 7), + MKDEXT_LAX_SPACING = (1 << 8), + MKDEXT_NO_EMAIL_AUTOLINK = (1 << 9), +}; + +/* sd_callbacks - functions for rendering parsed data */ +struct sd_callbacks { + /* block level callbacks - NULL skips the block */ + void (*blockcode)(struct buf *ob, const struct buf *text, const struct buf *lang, void *opaque); + void (*blockquote)(struct buf *ob, const struct buf *text, void *opaque); + void (*blockhtml)(struct buf *ob,const struct buf *text, void *opaque); + void (*header)(struct buf *ob, const struct buf *text, int level, void *opaque); + void (*hrule)(struct buf *ob, void *opaque); + void (*list)(struct buf *ob, const struct buf *text, 
int flags, void *opaque); + void (*listitem)(struct buf *ob, const struct buf *text, int flags, void *opaque); + void (*paragraph)(struct buf *ob, const struct buf *text, void *opaque); + void (*table)(struct buf *ob, const struct buf *header, const struct buf *body, void *opaque); + void (*table_row)(struct buf *ob, const struct buf *text, void *opaque); + void (*table_cell)(struct buf *ob, const struct buf *text, int flags, void *opaque, int col_span); + + + /* span level callbacks - NULL or return 0 prints the span verbatim */ + int (*autolink)(struct buf *ob, const struct buf *link, enum mkd_autolink type, void *opaque); + int (*codespan)(struct buf *ob, const struct buf *text, void *opaque); + int (*double_emphasis)(struct buf *ob, const struct buf *text, void *opaque); + int (*emphasis)(struct buf *ob, const struct buf *text, void *opaque); + int (*image)(struct buf *ob, const struct buf *link, const struct buf *title, const struct buf *alt, void *opaque); + int (*linebreak)(struct buf *ob, void *opaque); + int (*link)(struct buf *ob, const struct buf *link, const struct buf *title, const struct buf *content, void *opaque); + int (*raw_html_tag)(struct buf *ob, const struct buf *tag, void *opaque); + int (*triple_emphasis)(struct buf *ob, const struct buf *text, void *opaque); + int (*strikethrough)(struct buf *ob, const struct buf *text, void *opaque); + int (*superscript)(struct buf *ob, const struct buf *text, void *opaque); + + /* low level callbacks - NULL copies input directly into the output */ + void (*entity)(struct buf *ob, const struct buf *entity, void *opaque); + void (*normal_text)(struct buf *ob, const struct buf *text, void *opaque); + + /* header and footer */ + void (*doc_header)(struct buf *ob, void *opaque); + void (*doc_footer)(struct buf *ob, void *opaque); +}; + +struct sd_markdown; + +/********* + * FLAGS * + *********/ + +/* list/listitem flags */ +#define MKD_LIST_ORDERED 1 +#define MKD_LI_BLOCK 2 /*
<li> containing block data */ + +/********************** + * EXPORTED FUNCTIONS * + **********************/ + +extern struct sd_markdown * +sd_markdown_new( + unsigned int extensions, + size_t max_nesting, + size_t max_table_cols, + const struct sd_callbacks *callbacks, + void *opaque); + +extern void +sd_markdown_render(struct buf *ob, const uint8_t *document, size_t doc_size, struct sd_markdown *md); + +extern void +sd_markdown_free(struct sd_markdown *md); + +extern void +sd_version(int *major, int *minor, int *revision); + +#ifdef __cplusplus +} +#endif + +#endif + +/* vim: set filetype=c: */ diff --git a/SnudownTest/setup.py b/SnudownTest/setup.py new file mode 100644 index 0000000..ffec52d --- /dev/null +++ b/SnudownTest/setup.py @@ -0,0 +1,56 @@ +from distutils.spawn import find_executable +from setuptools import setup, Extension +from setuptools.command.build_ext import build_ext + +import re +import os +import subprocess +import fnmatch + +def c_files_in(directory): + paths = [] + names = os.listdir(directory) + for f in fnmatch.filter(names, '*.c'): + paths.append(os.path.join(directory, f)) + return paths + + +def process_gperf_file(gperf_file, output_file): + if not find_executable("gperf"): + raise Exception("Couldn't find `gperf`, is it installed?") + assert os.path.exists(gperf_file) + command = "gperf.exe "+ gperf_file+ " --output-file=" + output_file + print(command) + #subprocess.check_call(command) + +version = None +version_re = re.compile(r'^#define\s+SNUDOWN_VERSION\s+"([^"]+)"$') +with open('snudown.c', 'r') as f: + for line in f: + m = version_re.match(line) + if m: + version = m.group(1) +assert version + + +class GPerfingBuildExt(build_ext): + def run(self): + process_gperf_file("src\\html_entities.gperf", "src\\html_entities.h") + build_ext.run(self) + +setup( + name='snudown', + version=version, + author='Vicent Marti', + author_email='vicent@github.com', + license='MIT', + test_suite="test_snudown.test_snudown", + cmdclass={'build_ext': 
GPerfingBuildExt,}, + ext_modules=[ + Extension( + name='snudown', + sources=['snudown.c'] + c_files_in('src/') + c_files_in('html/'), + include_dirs=['src', 'html'] + ) + ], +) diff --git a/SnudownTest/snudown - Copy.c b/SnudownTest/snudown - Copy.c new file mode 100644 index 0000000..e268f66 --- /dev/null +++ b/SnudownTest/snudown - Copy.c @@ -0,0 +1,212 @@ +#define PY_SSIZE_T_CLEAN +#include <Python.h> + +#include "markdown.h" +#include "html.h" +#include "autolink.h" + +#define SNUDOWN_VERSION "1.4.0" + +enum snudown_renderer_mode { + RENDERER_USERTEXT = 0, + RENDERER_WIKI, + RENDERER_COUNT +}; + +struct snudown_renderopt { + struct html_renderopt html; + int nofollow; + const char *target; +}; + +struct snudown_renderer { + struct sd_markdown* main_renderer; + struct sd_markdown* toc_renderer; + struct module_state* state; + struct module_state* toc_state; +}; + +struct module_state { + struct sd_callbacks callbacks; + struct snudown_renderopt options; +}; + +static struct snudown_renderer sundown[RENDERER_COUNT]; + +static char* html_element_whitelist[] = {"tr", "th", "td", "table", "tbody", "thead", "tfoot", "caption", NULL}; +static char* html_attr_whitelist[] = {"colspan", "rowspan", "cellspacing", "cellpadding", "scope", NULL}; + +static struct module_state usertext_toc_state; +static struct module_state wiki_toc_state; +static struct module_state usertext_state; +static struct module_state wiki_state; + +/* The module doc strings */ +PyDoc_STRVAR(snudown_module__doc__, "When does the narwhal bacon? 
At Sundown."); +PyDoc_STRVAR(snudown_md__doc__, "Render a Markdown document"); + +static const unsigned int snudown_default_md_flags = + MKDEXT_NO_INTRA_EMPHASIS | + MKDEXT_SUPERSCRIPT | + MKDEXT_AUTOLINK | + MKDEXT_STRIKETHROUGH | + MKDEXT_TABLES; + +static const unsigned int snudown_default_render_flags = + HTML_SKIP_HTML | + HTML_SKIP_IMAGES | + HTML_SAFELINK | + HTML_ESCAPE | + HTML_USE_XHTML; + +static const unsigned int snudown_wiki_render_flags = + HTML_SKIP_HTML | + HTML_SAFELINK | + HTML_ALLOW_ELEMENT_WHITELIST | + HTML_ESCAPE | + HTML_USE_XHTML; + +static void +snudown_link_attr(struct buf *ob, const struct buf *link, void *opaque) +{ + struct snudown_renderopt *options = opaque; + + if (options->nofollow) + BUFPUTSL(ob, " rel=\"nofollow\""); + + if (options->target != NULL) { + BUFPUTSL(ob, " target=\""); + bufputs(ob, options->target); + bufputc(ob, '\"'); + } +} + +static struct sd_markdown* make_custom_renderer(struct module_state* state, + const unsigned int renderflags, + const unsigned int markdownflags, + int toc_renderer) { + if(toc_renderer) { + sdhtml_toc_renderer(&state->callbacks, + (struct html_renderopt *)&state->options); + } else { + sdhtml_renderer(&state->callbacks, + (struct html_renderopt *)&state->options, + renderflags); + } + + state->options.html.link_attributes = &snudown_link_attr; + state->options.html.html_element_whitelist = html_element_whitelist; + state->options.html.html_attr_whitelist = html_attr_whitelist; + + return sd_markdown_new( + markdownflags, + 16, + 64, + &state->callbacks, + &state->options + ); +} + +void init_default_renderer(PyObject *module) { + PyModule_AddIntConstant(module, "RENDERER_USERTEXT", RENDERER_USERTEXT); + sundown[RENDERER_USERTEXT].main_renderer = make_custom_renderer(&usertext_state, snudown_default_render_flags, snudown_default_md_flags, 0); + sundown[RENDERER_USERTEXT].toc_renderer = make_custom_renderer(&usertext_toc_state, snudown_default_render_flags, snudown_default_md_flags, 1); + 
sundown[RENDERER_USERTEXT].state = &usertext_state; + sundown[RENDERER_USERTEXT].toc_state = &usertext_toc_state; +} + +void init_wiki_renderer(PyObject *module) { + PyModule_AddIntConstant(module, "RENDERER_WIKI", RENDERER_WIKI); + sundown[RENDERER_WIKI].main_renderer = make_custom_renderer(&wiki_state, snudown_wiki_render_flags, snudown_default_md_flags, 0); + sundown[RENDERER_WIKI].toc_renderer = make_custom_renderer(&wiki_toc_state, snudown_wiki_render_flags, snudown_default_md_flags, 1); + sundown[RENDERER_WIKI].state = &wiki_state; + sundown[RENDERER_WIKI].toc_state = &wiki_toc_state; +} + +static PyObject * +snudown_md(PyObject *self, PyObject *args, PyObject *kwargs) +{ + static char *kwlist[] = {"text", "nofollow", "target", "toc_id_prefix", "renderer", "enable_toc", NULL}; + + struct buf ib, *ob; + PyObject *py_result; + const char* result_text; + int renderer = RENDERER_USERTEXT; + int enable_toc = 0; + struct snudown_renderer _snudown; + int nofollow = 0; + char* target = NULL; + char* toc_id_prefix = NULL; + unsigned int flags; + + memset(&ib, 0x0, sizeof(struct buf)); + + /* Parse arguments */ + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|izzii", kwlist, + &ib.data, &ib.size, &nofollow, + &target, &toc_id_prefix, &renderer, &enable_toc)) { + return NULL; + } + + if (renderer < 0 || renderer >= RENDERER_COUNT) { + PyErr_SetString(PyExc_ValueError, "Invalid renderer"); + return NULL; + } + + _snudown = sundown[renderer]; + + struct snudown_renderopt *options = &(_snudown.state->options); + options->nofollow = nofollow; + options->target = target; + + /* Output buffer */ + ob = bufnew(128); + + flags = options->html.flags; + + if (enable_toc) { + _snudown.toc_state->options.html.toc_id_prefix = toc_id_prefix; + sd_markdown_render(ob, ib.data, ib.size, _snudown.toc_renderer); + _snudown.toc_state->options.html.toc_id_prefix = NULL; + + options->html.flags |= HTML_TOC; + } + + options->html.toc_id_prefix = toc_id_prefix; + + /* do the magic */ + 
sd_markdown_render(ob, ib.data, ib.size, _snudown.main_renderer); + + options->html.toc_id_prefix = NULL; + options->html.flags = flags; + + /* make a Python string */ + result_text = ""; + if (ob->data) + result_text = (const char*)ob->data; + py_result = Py_BuildValue("s#", result_text, (int)ob->size); + + /* Cleanup */ + bufrelease(ob); + return py_result; +} + +static PyMethodDef snudown_methods[] = { + {"markdown", (PyCFunction) snudown_md, METH_VARARGS | METH_KEYWORDS, snudown_md__doc__}, + {NULL, NULL, 0, NULL} /* Sentinel */ +}; + +PyMODINIT_FUNC initsnudown(void) +{ + PyObject *module; + + module = Py_InitModule3("snudown", snudown_methods, snudown_module__doc__); + if (module == NULL) + return; + + init_default_renderer(module); + init_wiki_renderer(module); + + /* Version */ + PyModule_AddStringConstant(module, "__version__", SNUDOWN_VERSION); +} diff --git a/SnudownTest/snudown-validator.c b/SnudownTest/snudown-validator.c new file mode 100644 index 0000000..153e1c4 --- /dev/null +++ b/SnudownTest/snudown-validator.c @@ -0,0 +1,226 @@ +#include "markdown.h" +#include "html.h" +#include "buffer.h" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define READ_UNIT 1024 +#define OUTPUT_UNIT 64 + +#include "autolink.h" + +#define SNUDOWN_VERSION "1.3.2" + +enum snudown_renderer_mode { + RENDERER_USERTEXT = 0, + RENDERER_WIKI, + RENDERER_COUNT +}; + +struct snudown_renderopt { + struct html_renderopt html; + int nofollow; + const char *target; +}; + +struct snudown_renderer { + struct sd_markdown* main_renderer; + struct sd_markdown* toc_renderer; + struct module_state* state; + struct module_state* toc_state; +}; + +struct module_state { + struct sd_callbacks callbacks; + struct snudown_renderopt options; +}; + +static struct snudown_renderer sundown[RENDERER_COUNT]; + +static char* html_element_whitelist[] = {"tr", "th", "td", "table", "tbody", "thead", "tfoot", "caption", NULL}; +static char* html_attr_whitelist[] = 
{"colspan", "rowspan", "cellspacing", "cellpadding", "scope", NULL}; + +static struct module_state usertext_toc_state; +static struct module_state wiki_toc_state; +static struct module_state usertext_state; +static struct module_state wiki_state; + +static const unsigned int snudown_default_md_flags = + MKDEXT_NO_INTRA_EMPHASIS | + MKDEXT_SUPERSCRIPT | + MKDEXT_AUTOLINK | + MKDEXT_STRIKETHROUGH | + MKDEXT_TABLES; + +static const unsigned int snudown_default_render_flags = + HTML_SKIP_HTML | + HTML_SKIP_IMAGES | + HTML_SAFELINK | + HTML_ESCAPE | + HTML_USE_XHTML; + +static const unsigned int snudown_wiki_render_flags = + HTML_SKIP_HTML | + HTML_SAFELINK | + HTML_ALLOW_ELEMENT_WHITELIST | + HTML_ESCAPE | + HTML_USE_XHTML; + +static void +snudown_link_attr(struct buf *ob, const struct buf *link, void *opaque) +{ + struct snudown_renderopt *options = opaque; + + if (options->nofollow) + BUFPUTSL(ob, " rel=\"nofollow\""); + + if (options->target != NULL) { + BUFPUTSL(ob, " target=\""); + bufputs(ob, options->target); + bufputc(ob, '\"'); + } +} + +static struct sd_markdown* make_custom_renderer(struct module_state* state, + const unsigned int renderflags, + const unsigned int markdownflags, + int toc_renderer) { + if(toc_renderer) { + sdhtml_toc_renderer(&state->callbacks, + (struct html_renderopt *)&state->options); + } else { + sdhtml_renderer(&state->callbacks, + (struct html_renderopt *)&state->options, + renderflags); + } + + state->options.html.link_attributes = &snudown_link_attr; + state->options.html.html_element_whitelist = html_element_whitelist; + state->options.html.html_attr_whitelist = html_attr_whitelist; + + return sd_markdown_new( + markdownflags, + 16, + 64, + &state->callbacks, + &state->options + ); +} + +void init_default_renderer() { + sundown[RENDERER_USERTEXT].main_renderer = make_custom_renderer(&usertext_state, snudown_default_render_flags, snudown_default_md_flags, 0); + sundown[RENDERER_USERTEXT].toc_renderer = 
make_custom_renderer(&usertext_toc_state, snudown_default_render_flags, snudown_default_md_flags, 1); + sundown[RENDERER_USERTEXT].state = &usertext_state; + sundown[RENDERER_USERTEXT].toc_state = &usertext_toc_state; +} + +void init_wiki_renderer() { + sundown[RENDERER_WIKI].main_renderer = make_custom_renderer(&wiki_state, snudown_wiki_render_flags, snudown_default_md_flags, 0); + sundown[RENDERER_WIKI].toc_renderer = make_custom_renderer(&wiki_toc_state, snudown_wiki_render_flags, snudown_default_md_flags, 1); + sundown[RENDERER_WIKI].state = &wiki_state; + sundown[RENDERER_WIKI].toc_state = &wiki_toc_state; +} + +void +snudown_md(struct buf *ob, const uint8_t *document, size_t doc_size, int wiki_mode) +{ + int renderer = RENDERER_USERTEXT; + int enable_toc = 0; + struct snudown_renderer _snudown; + int nofollow = 0; + char* target = NULL; + char* toc_id_prefix = NULL; + unsigned int flags; + + if (wiki_mode) + renderer = RENDERER_WIKI; + + _snudown = sundown[renderer]; + + struct snudown_renderopt *options = &(_snudown.state->options); + options->nofollow = nofollow; + options->target = target; + + flags = options->html.flags; + + if (enable_toc) { + _snudown.toc_state->options.html.toc_id_prefix = toc_id_prefix; + sd_markdown_render(ob, document, doc_size, _snudown.toc_renderer); + _snudown.toc_state->options.html.toc_id_prefix = NULL; + + options->html.flags |= HTML_TOC; + } + + options->html.toc_id_prefix = toc_id_prefix; + + /* do the magic */ + sd_markdown_render(ob, document, doc_size, _snudown.main_renderer); + + options->html.toc_id_prefix = NULL; + options->html.flags = flags; +} +int +main(int argc, char **argv) +{ + init_default_renderer(); + init_wiki_renderer(); + + struct buf *ib, *ob; + int size_read = 0, wiki_mode = 0, i = 0, have_errors = 0; + + /* reading everything */ + ib = bufnew(READ_UNIT); + bufgrow(ib, READ_UNIT); + while ((size_read = fread(ib->data + ib->size, 1, ib->asize - ib->size, stdin)) > 0) { + ib->size += size_read; + 
bufgrow(ib, ib->size + READ_UNIT); + } + /* Render to a buffer, then print that out */ + ob = bufnew(OUTPUT_UNIT); + bufputs(ob, "\n"); + snudown_md(ob, ib->data, ib->size, wiki_mode); + bufputs(ob, "\n"); + + // Wiki mode explicitly allows unbalanced tags, need some way to exclude those + if (!wiki_mode) { + GumboOutput* output = gumbo_parse_with_options(&kGumboDefaultOptions, bufcstr(ob), ob->size); + + for (i=0; i < output->errors.length; ++i) { + // stupid "public" API I hacked in. + void* thing = output->errors.data[i]; + GumboErrorType type = gumbo_get_error_type(thing); + switch(type) { + case GUMBO_ERR_UTF8_INVALID: + case GUMBO_ERR_UTF8_NULL: + // Making sure the user gave us valid + // utf-8 or transforming it to valid + // utf-8 is outside the scope of snudown + continue; + default: + have_errors = 1; + printf("%s\n", GUMBO_ERROR_NAMES[type]); + printf("%s\n",gumbo_get_error_text(thing)); + printf("===============\n"); + break; + } + } + + if (have_errors) { + // gotta trigger a crash for AFL to catch it + assert(0); + } + + gumbo_destroy_output(&kGumboDefaultOptions, output); + } + bufrelease(ob); + bufrelease(ib); + return 0; +} diff --git a/SnudownTest/snudown.c b/SnudownTest/snudown.c new file mode 100644 index 0000000..8cbacaf --- /dev/null +++ b/SnudownTest/snudown.c @@ -0,0 +1,232 @@ +#define PY_SSIZE_T_CLEAN +#include <Python.h> + +#include "markdown.h" +#include "html.h" +#include "autolink.h" + +#define SNUDOWN_VERSION "1.4.0" + +enum snudown_renderer_mode { + RENDERER_USERTEXT = 0, + RENDERER_WIKI, + RENDERER_COUNT +}; + +struct snudown_renderopt { + struct html_renderopt html; + int nofollow; + const char *target; +}; + +struct snudown_renderer { + struct sd_markdown* main_renderer; + struct sd_markdown* toc_renderer; + struct module_state* state; + struct module_state* toc_state; +}; + +struct module_state { + struct sd_callbacks callbacks; + struct snudown_renderopt options; +}; + +static struct snudown_renderer sundown[RENDERER_COUNT]; + +static 
char* html_element_whitelist[] = {"tr", "th", "td", "table", "tbody", "thead", "tfoot", "caption", NULL}; +static char* html_attr_whitelist[] = {"colspan", "rowspan", "cellspacing", "cellpadding", "scope", NULL}; + +static struct module_state usertext_toc_state; +static struct module_state wiki_toc_state; +static struct module_state usertext_state; +static struct module_state wiki_state; + +/* The module doc strings */ +PyDoc_STRVAR(snudown_module__doc__, "When does the narwhal bacon? At Sundown."); +PyDoc_STRVAR(snudown_md__doc__, "Render a Markdown document"); + +static const unsigned int snudown_default_md_flags = + MKDEXT_NO_INTRA_EMPHASIS | + MKDEXT_SUPERSCRIPT | + MKDEXT_AUTOLINK | + MKDEXT_STRIKETHROUGH | + MKDEXT_TABLES; + +static const unsigned int snudown_default_render_flags = + HTML_SKIP_HTML | + HTML_SKIP_IMAGES | + HTML_SAFELINK | + HTML_ESCAPE | + HTML_USE_XHTML; + +static const unsigned int snudown_wiki_render_flags = + HTML_SKIP_HTML | + HTML_SAFELINK | + HTML_ALLOW_ELEMENT_WHITELIST | + HTML_ESCAPE | + HTML_USE_XHTML; + +static void +snudown_link_attr(struct buf *ob, const struct buf *link, void *opaque) +{ + struct snudown_renderopt *options = opaque; + + if (options->nofollow) + BUFPUTSL(ob, " rel=\"nofollow\""); + + if (options->target != NULL) { + BUFPUTSL(ob, " target=\""); + bufputs(ob, options->target); + bufputc(ob, '\"'); + } +} + +static struct sd_markdown* make_custom_renderer(struct module_state* state, + const unsigned int renderflags, + const unsigned int markdownflags, + int toc_renderer) { + if(toc_renderer) { + sdhtml_toc_renderer(&state->callbacks, + (struct html_renderopt *)&state->options); + } else { + sdhtml_renderer(&state->callbacks, + (struct html_renderopt *)&state->options, + renderflags); + } + + state->options.html.link_attributes = &snudown_link_attr; + state->options.html.html_element_whitelist = html_element_whitelist; + state->options.html.html_attr_whitelist = html_attr_whitelist; + + return sd_markdown_new( + 
markdownflags, + 16, + 64, + &state->callbacks, + &state->options + ); +} + +void init_default_renderer(PyObject *module) { + PyModule_AddIntConstant(module, "RENDERER_USERTEXT", RENDERER_USERTEXT); + sundown[RENDERER_USERTEXT].main_renderer = make_custom_renderer(&usertext_state, snudown_default_render_flags, snudown_default_md_flags, 0); + sundown[RENDERER_USERTEXT].toc_renderer = make_custom_renderer(&usertext_toc_state, snudown_default_render_flags, snudown_default_md_flags, 1); + sundown[RENDERER_USERTEXT].state = &usertext_state; + sundown[RENDERER_USERTEXT].toc_state = &usertext_toc_state; +} + +void init_wiki_renderer(PyObject *module) { + PyModule_AddIntConstant(module, "RENDERER_WIKI", RENDERER_WIKI); + sundown[RENDERER_WIKI].main_renderer = make_custom_renderer(&wiki_state, snudown_wiki_render_flags, snudown_default_md_flags, 0); + sundown[RENDERER_WIKI].toc_renderer = make_custom_renderer(&wiki_toc_state, snudown_wiki_render_flags, snudown_default_md_flags, 1); + sundown[RENDERER_WIKI].state = &wiki_state; + sundown[RENDERER_WIKI].toc_state = &wiki_toc_state; +} + +static PyObject * +snudown_md(PyObject *self, PyObject *args, PyObject *kwargs) +{ + struct snudown_renderopt *options; + static char *kwlist[] = {"text", "nofollow", "target", "toc_id_prefix", "renderer", "enable_toc", NULL}; + + struct buf ib, *ob; + PyObject *py_result; + const char* result_text; + int renderer = RENDERER_USERTEXT; + int enable_toc = 0; + struct snudown_renderer _snudown; + int nofollow = 0; + char* target = NULL; + char* toc_id_prefix = NULL; + unsigned int flags; + + memset(&ib, 0x0, sizeof(struct buf)); + + /* Parse arguments */ + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|izzii", kwlist, + &ib.data, &ib.size, &nofollow, + &target, &toc_id_prefix, &renderer, &enable_toc)) { + return NULL; + } + + if (renderer < 0 || renderer >= RENDERER_COUNT) { + PyErr_SetString(PyExc_ValueError, "Invalid renderer"); + return NULL; + }; + + _snudown = sundown[renderer]; + + + + 
+ options = &(_snudown.state->options); /* point at the selected renderer's options; the bare expression left `options` uninitialized */ + options->nofollow = nofollow; + options->target = target; + + /* Output buffer */ + ob = bufnew(128); + + flags = options->html.flags; + + if (enable_toc) { + _snudown.toc_state->options.html.toc_id_prefix = toc_id_prefix; + sd_markdown_render(ob, ib.data, ib.size, _snudown.toc_renderer); + _snudown.toc_state->options.html.toc_id_prefix = NULL; + + options->html.flags |= HTML_TOC; + } + + options->html.toc_id_prefix = toc_id_prefix; + + /* do the magic */ + sd_markdown_render(ob, ib.data, ib.size, _snudown.main_renderer); + + options->html.toc_id_prefix = NULL; + options->html.flags = flags; + + /* make a Python string */ + result_text = ""; + if (ob->data) + result_text = (const char*)ob->data; + py_result = Py_BuildValue("s#", result_text, (Py_ssize_t)ob->size); /* PY_SSIZE_T_CLEAN: "#" lengths are Py_ssize_t, not int */ + + /* Cleanup */ + bufrelease(ob); + return py_result; +} + + +static PyMethodDef snudown_methods[] = { + {"markdown", (PyCFunction) snudown_md, METH_VARARGS | METH_KEYWORDS, snudown_md__doc__}, + {NULL, NULL, 0, NULL} /* Sentinel */ +}; +PyMODINIT_FUNC PyInit_snudown(void) +{ + PyObject *module; + + static struct PyModuleDef wtf = { /* static: the module object keeps a pointer to its definition */ + PyModuleDef_HEAD_INIT, + "snudown", /* m_name */ + "This is snudown", /* m_doc */ + -1, /* m_size */ + snudown_methods, /* m_methods */ + NULL, /* m_reload */ + NULL, /* m_traverse */ + NULL, /* m_clear */ + NULL, /* m_free */ + }; + module = PyModule_Create(&wtf); + if (module == NULL) + return NULL; /* exception already set by PyModule_Create */ + + init_default_renderer(module); + init_wiki_renderer(module); + + /* Version */ + PyModule_AddStringConstant(module, "__version__", SNUDOWN_VERSION); + return module; /* the import machinery needs the new module object */ +}; +void initsnudown(void) +{ + (void) PyInit_snudown(); +}; \ No newline at end of file diff --git a/SnudownTest/src/autolink.c b/SnudownTest/src/autolink.c new file mode 100644 index 0000000..8d0e39a --- /dev/null +++ b/SnudownTest/src/autolink.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 2011, 
Vicent Marti + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "buffer.h" +#include "autolink.h" + +#include +#include +#include +#include + +#if defined(_WIN32) +#define strncasecmp _strnicmp +#endif + +int +sd_autolink_issafe(const uint8_t *link, size_t link_len) +{ + static const size_t valid_uris_count = 14; + static const char *valid_uris[] = { + "http://", "https://", "ftp://", "mailto://", + "/", "git://", "steam://", "irc://", "news://", "mumble://", + "ssh://", "ircs://", "ts3server://", "#" + }; + + size_t i; + + for (i = 0; i < valid_uris_count; ++i) { + size_t len = strlen(valid_uris[i]); + + if (link_len > len && + strncasecmp((char *)link, valid_uris[i], len) == 0 && + (isalnum(link[len]) || link[len] == '#' || link[len] == '/' || link[len] == '?')) + return 1; + } + + return 0; +} + +static size_t +autolink_delim(uint8_t *data, size_t link_end, size_t max_rewind, size_t size) +{ + uint8_t cclose, copen = 0; + size_t i; + + for (i = 0; i < link_end; ++i) + if (data[i] == '<') { + link_end = i; + break; + } + + while (link_end > 0) { + uint8_t c = data[link_end - 1]; + + if (c == 0) + break; + + if (strchr("?!.,", c) != NULL) + link_end--; + + else if (c == ';') { + size_t new_end = link_end - 2; + + while (new_end > 0 && isalpha(data[new_end])) + new_end--; 
+ + if (new_end < link_end - 2 && data[new_end] == '&') + link_end = new_end; + else + link_end--; + } + else break; + } + + if (link_end == 0) + return 0; + + cclose = data[link_end - 1]; + + switch (cclose) { + case '"': copen = '"'; break; + case '\'': copen = '\''; break; + case ')': copen = '('; break; + case ']': copen = '['; break; + case '}': copen = '{'; break; + } + + if (copen != 0) { + size_t closing = 0; + size_t opening = 0; + size_t i = 0; + + /* Try to close the final punctuation sign in this same line; + * if we managed to close it outside of the URL, that means that it's + * not part of the URL. If it closes inside the URL, that means it + * is part of the URL. + * + * Examples: + * + * foo http://www.pokemon.com/Pikachu_(Electric) bar + * => http://www.pokemon.com/Pikachu_(Electric) + * + * foo (http://www.pokemon.com/Pikachu_(Electric)) bar + * => http://www.pokemon.com/Pikachu_(Electric) + * + * foo http://www.pokemon.com/Pikachu_(Electric)) bar + * => http://www.pokemon.com/Pikachu_(Electric)) + * + * (foo http://www.pokemon.com/Pikachu_(Electric)) bar + * => foo http://www.pokemon.com/Pikachu_(Electric) + */ + + while (i < link_end) { + if (data[i] == copen) + opening++; + else if (data[i] == cclose) + closing++; + + i++; + } + + if (closing != opening) + link_end--; + } + + return link_end; +} + +/* + * Checks that `prefix_char` occurs on a word boundary just before `data`, + * where `data` points to the character to search to the left of, and a word boundary + * is (currently) a whitespace character, punctuation, or the start of the string. + * Returns the length of the prefix. 
+ */ +static int +check_reddit_autolink_prefix( + const uint8_t* data, + size_t max_rewind, + size_t max_lookbehind, + size_t size, + char prefix_char + ) +{ + /* Make sure this `/` is part of `/?r/` */ + if (size < 2 || max_rewind < 1 || data[-1] != prefix_char) + return 0; + + /* Not at the start of the buffer, no inlines to the immediate left of the `prefix_char` */ + if (max_rewind > 1) { + const char boundary = data[-2]; + if (boundary == '/') + return 2; + /** + * Here's where our lack of unicode-awareness bites us. We don't correctly + * match punctuation / whitespace characters for the boundary, because we + * reject valid cases like "。r/example" (note the fullwidth period.) + * + * A better implementation might try to rewind over bytes with the 8th bit set, try + * to decode them to a valid codepoint, then do a unicode-aware check on the codepoint. + */ + else if (ispunct(boundary) || isspace(boundary)) + return 1; + else + return 0; + } else if (max_lookbehind > 2) { + /* There's an inline element just left of the `prefix_char`, is it an escaped forward + * slash? bail out so we correctly handle stuff like "\/r/foo". This will also correctly + * allow "\\/r/foo". + */ + if (data[-2] == '/' && data[-3] == '\\') + return 0; + } + + /* Must be a new-style shortlink with nothing relevant to the left of it. */ + return 1; +} + +static size_t +check_domain(uint8_t *data, size_t size, int allow_short) +{ + size_t i, np = 0; + + if (!isalnum(data[0])) + return 0; + + for (i = 1; i < size - 1; ++i) { + if (data[i] == '.') np++; + else if (!isalnum(data[i]) && data[i] != '-') break; + } + + if (allow_short) { + /* We don't need a valid domain in the strict sense (with + * least one dot; so just make sure it's composed of valid + * domain characters and return the length of the the valid + * sequence. */ + return i; + } else { + /* a valid domain needs to have at least a dot. + * that's as far as we get */ + return np ? 
i : 0; + } +} + +size_t +sd_autolink__www( + size_t *rewind_p, + struct buf *link, + uint8_t *data, + size_t max_rewind, + size_t size, + unsigned int flags) +{ + size_t link_end; + + if (max_rewind > 0 && !ispunct(data[-1]) && !isspace(data[-1])) + return 0; + + if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0) + return 0; + + link_end = check_domain(data, size, 0); + + if (link_end == 0) + return 0; + + while (link_end < size && !isspace(data[link_end])) + link_end++; + + link_end = autolink_delim(data, link_end, max_rewind, size); + + if (link_end == 0) + return 0; + + bufput(link, data, link_end); + *rewind_p = 0; + + return (int)link_end; +} + +size_t +sd_autolink__email( + size_t *rewind_p, + struct buf *link, + uint8_t *data, + size_t max_rewind, + size_t size, + unsigned int flags) +{ + size_t link_end, rewind; + int nb = 0, np = 0; + + for (rewind = 0; rewind < max_rewind; ++rewind) { + uint8_t c = data[-rewind - 1]; + + if (c == 0) + break; + + if (isalnum(c)) + continue; + + if (strchr(".+-_", c) != NULL) + continue; + + break; + } + + if (rewind == 0) + return 0; + + for (link_end = 0; link_end < size; ++link_end) { + uint8_t c = data[link_end]; + + if (isalnum(c)) + continue; + + if (c == '@') + nb++; + else if (c == '.' 
&& link_end < size - 1) + np++; + else if (c != '-' && c != '_') + break; + } + + if (link_end < 2 || nb != 1 || np == 0) + return 0; + + link_end = autolink_delim(data, link_end, max_rewind, size); + + if (link_end == 0) + return 0; + + bufput(link, data - rewind, link_end + rewind); + *rewind_p = rewind; + + return link_end; +} + +size_t +sd_autolink__url( + size_t *rewind_p, + struct buf *link, + uint8_t *data, + size_t max_rewind, + size_t size, + unsigned int flags) +{ + size_t link_end, rewind = 0, domain_len; + + if (size < 4 || data[1] != '/' || data[2] != '/') + return 0; + + while (rewind < max_rewind && isalpha(data[-rewind - 1])) + rewind++; + + if (!sd_autolink_issafe(data - rewind, size + rewind)) + return 0; + + link_end = strlen("://"); + + domain_len = check_domain( + data + link_end, + size - link_end, + flags & SD_AUTOLINK_SHORT_DOMAINS); + + if (domain_len == 0) + return 0; + + link_end += domain_len; + while (link_end < size && !isspace(data[link_end])) + link_end++; + + link_end = autolink_delim(data, link_end, max_rewind, size); + + if (link_end == 0) + return 0; + + bufput(link, data - rewind, link_end + rewind); + *rewind_p = rewind; + + return link_end; +} + +size_t +sd_autolink__subreddit( + size_t *rewind_p, + struct buf *link, + uint8_t *data, + size_t max_rewind, + size_t max_lookbehind, + size_t size, + int *no_slash + ) +{ + /** + * This is meant to handle both r/foo and /r/foo style subreddit references. + * In a valid /?r/ link, `*data` will always point to the '/' after the first 'r'. 
+ * In pseudo-regex, this matches something like: + * + * `(/|(?<=\b))r/(all-)?%subreddit%([-+]%subreddit%)*(/[\w\-/]*)?` + * where %subreddit% == `((t:)?\w{2,24}|reddit\.com)` + */ + size_t link_end; + size_t rewind; + int is_allminus = 0; + + rewind = check_reddit_autolink_prefix(data, max_rewind, max_lookbehind, size, 'r'); + if (!rewind) + return 0; + + /* offset to the "meat" of the link */ + link_end = strlen("/"); + + if (size >= link_end + 4 && strncasecmp((char*)data + link_end, "all-", 4) == 0) + is_allminus = 1; + + do { + size_t start = link_end; + int max_length = 24; + + /* special case: /r/reddit.com (only subreddit containing '.'). */ + if ( size >= link_end+10 && strncasecmp((char*)data+link_end, "reddit.com", 10) == 0 ) { + link_end += 10; + /* Make sure there are no trailing characters (don't do + * any autolinking for /r/reddit.commission) */ + max_length = 10; + } + + /* If not a special case, verify it begins with (t:)?[A-Za-z0-9] */ + else { + /* support autolinking to timereddits, /r/t:when (1 April 2012) */ + if ( size > link_end+2 && strncasecmp((char*)data+link_end, "t:", 2) == 0 ) + link_end += 2; /* Jump over the 't:' */ + + /* the first character of a subreddit name must be a letter or digit */ + if (!isalnum(data[link_end])) + return 0; + link_end += 1; + } + + /* consume valid characters ([A-Za-z0-9_]) until we run out */ + while (link_end < size && (isalnum(data[link_end]) || + data[link_end] == '_')) + link_end++; + + /* valid subreddit names are between 3 and 21 characters, with + * some subreddits having 2-character names. Don't bother with + * autolinking for anything outside this length range. 
+ * (chksrname function in reddit/.../validator.py) */ + if ( link_end-start < 2 || link_end-start > max_length ) + return 0; + + /* If we are linking to a multireddit, continue */ + } while ( link_end < size && (data[link_end] == '+' || (is_allminus && data[link_end] == '-')) && link_end++ ); + + if (link_end < size && data[link_end] == '/') { + while (link_end < size && (isalnum(data[link_end]) || + data[link_end] == '_' || + data[link_end] == '/' || + data[link_end] == '-')) + link_end++; + } + + /* make the link */ + bufput(link, data - rewind, link_end + rewind); + + *no_slash = (rewind == 1); + *rewind_p = rewind; + + return link_end; +} + +size_t +sd_autolink__username( + size_t *rewind_p, + struct buf *link, + uint8_t *data, + size_t max_rewind, + size_t max_lookbehind, + size_t size, + int *no_slash + ) +{ + size_t link_end; + size_t rewind; + + if (size < 3) + return 0; + + rewind = check_reddit_autolink_prefix(data, max_rewind, max_lookbehind, size, 'u'); + if (!rewind) + return 0; + + link_end = strlen("/"); + + /* the first letter of a username must... 
well, be valid, we don't care otherwise */ + if (!isalnum(data[link_end]) && data[link_end] != '_' && data[link_end] != '-') + return 0; + link_end += 1; + + /* consume valid characters ([A-Za-z0-9_-/]) until we run out */ + while (link_end < size && (isalnum(data[link_end]) || + data[link_end] == '_' || + data[link_end] == '/' || + data[link_end] == '-')) + link_end++; + + /* make the link */ + bufput(link, data - rewind, link_end + rewind); + + *no_slash = (rewind == 1); + *rewind_p = rewind; + + return link_end; +} diff --git a/SnudownTest/src/autolink.h b/SnudownTest/src/autolink.h new file mode 100644 index 0000000..55b7aaa --- /dev/null +++ b/SnudownTest/src/autolink.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2011, Vicent Marti + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#ifndef UPSKIRT_AUTOLINK_H +#define UPSKIRT_AUTOLINK_H + +#include "buffer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + SD_AUTOLINK_SHORT_DOMAINS = (1 << 0), +}; + +int +sd_autolink_issafe(const uint8_t *link, size_t link_len); + +size_t +sd_autolink__www(size_t *rewind_p, struct buf *link, + uint8_t *data, size_t max_rewind, size_t size, unsigned int flags); + +size_t +sd_autolink__email(size_t *rewind_p, struct buf *link, + uint8_t *data, size_t max_rewind, size_t size, unsigned int flags); + +size_t +sd_autolink__url(size_t *rewind_p, struct buf *link, + uint8_t *data, size_t max_rewind, size_t size, unsigned int flags); + +extern size_t +sd_autolink__subreddit(size_t *rewind_p, struct buf *link, uint8_t *data, + size_t max_rewind, size_t max_lookbehind, size_t size, int *no_slash); + +extern size_t +sd_autolink__username(size_t *rewind_p, struct buf *link, uint8_t *data, + size_t max_rewind, size_t max_lookbehind, size_t size, int *no_slash); + +#ifdef __cplusplus +} +#endif + +#endif + +/* vim: set filetype=c: */ diff --git a/SnudownTest/src/buffer.c b/SnudownTest/src/buffer.c new file mode 100644 index 0000000..ab18948 --- /dev/null +++ b/SnudownTest/src/buffer.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2008, Natacha Porté + * Copyright (c) 2011, Vicent Martí + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#define BUFFER_MAX_ALLOC_SIZE (1024 * 1024 * 16) //16mb + +#include "buffer.h" + +#include +#include +#include +#include + +/* MSVC compat */ +#if defined(_MSC_VER) +# define _buf_vsnprintf _vsnprintf +#else +# define _buf_vsnprintf vsnprintf +#endif + +int +bufprefix(const struct buf *buf, const char *prefix) +{ + size_t i; + assert(buf && buf->unit); + + for (i = 0; i < buf->size; ++i) { + if (prefix[i] == 0) + return 0; + + if (buf->data[i] != prefix[i]) + return buf->data[i] - prefix[i]; + } + + return 0; +} + +/* bufgrow: increasing the allocated size to the given value */ +int +bufgrow(struct buf *buf, size_t neosz) +{ + size_t neoasz; + void *neodata; + + assert(buf && buf->unit); + + if (neosz > BUFFER_MAX_ALLOC_SIZE) + return BUF_ENOMEM; + + if (buf->asize >= neosz) + return BUF_OK; + + neoasz = buf->asize + buf->unit; + while (neoasz < neosz) + neoasz += buf->unit; + + neodata = realloc(buf->data, neoasz); + if (!neodata) + return BUF_ENOMEM; + + buf->data = neodata; + buf->asize = neoasz; + return BUF_OK; +} + + +/* bufnew: allocation of a new buffer */ +struct buf * +bufnew(size_t unit) +{ + struct buf *ret; + ret = malloc(sizeof (struct buf)); + + if (ret) { + ret->data = 0; + ret->size = ret->asize = 0; + ret->unit = unit; + } + return ret; +} + +/* bufnullterm: NULL-termination of the string array */ +const char * +bufcstr(struct buf *buf) +{ + assert(buf && buf->unit); + + if (buf->size < buf->asize && buf->data[buf->size] == 0) + return (char *)buf->data; + + if (buf->size + 1 <= buf->asize || bufgrow(buf, buf->size + 1) == 0) { + buf->data[buf->size] = 0; + return (char *)buf->data; + } + + return NULL; +} + 
+/* bufprintf: formatted printing to a buffer */ +void +bufprintf(struct buf *buf, const char *fmt, ...) +{ + va_list ap; + int n; + + assert(buf && buf->unit); + + if (buf->size >= buf->asize && bufgrow(buf, buf->size + 1) < 0) + return; + va_start(ap, fmt); + n = _buf_vsnprintf((char *)buf->data + buf->size, buf->asize - buf->size, fmt, ap); + va_end(ap); + + if (n < 0) { +#ifdef _MSC_VER + va_start(ap, fmt); + n = _vscprintf(fmt, ap); + va_end(ap); +#else + return; +#endif + } + if ((size_t)n >= buf->asize - buf->size) { + if (bufgrow(buf, buf->size + n + 1) < 0) + return; + + va_start(ap, fmt); + n = _buf_vsnprintf((char *)buf->data + buf->size, buf->asize - buf->size, fmt, ap); + va_end(ap); + } + + if (n < 0) + return; + + buf->size += n; +} + +/* bufput: appends raw data to a buffer */ +void +bufput(struct buf *buf, const void *data, size_t len) +{ + assert(buf && buf->unit); + + if (buf->size + len > buf->asize && bufgrow(buf, buf->size + len) < 0) + return; + + memcpy(buf->data + buf->size, data, len); + buf->size += len; +} + +/* bufputs: appends a NUL-terminated string to a buffer */ +void +bufputs(struct buf *buf, const char *str) +{ + bufput(buf, str, strlen(str)); +} + + +/* bufputc: appends a single uint8_t to a buffer */ +void +bufputc(struct buf *buf, int c) +{ + assert(buf && buf->unit); + + if (buf->size + 1 > buf->asize && bufgrow(buf, buf->size + 1) < 0) + return; + + buf->data[buf->size] = c; + buf->size += 1; +} + +/* bufrelease: decrease the reference count and free the buffer if needed */ +void +bufrelease(struct buf *buf) +{ + if (!buf) + return; + + free(buf->data); + free(buf); +} + + +/* bufreset: frees internal data of the buffer */ +void +bufreset(struct buf *buf) +{ + if (!buf) + return; + + free(buf->data); + buf->data = NULL; + buf->size = buf->asize = 0; +} + +/* bufslurp: removes a given number of bytes from the head of the array */ +void +bufslurp(struct buf *buf, size_t len) +{ + assert(buf && buf->unit); + + if (len >= 
buf->size) { + buf->size = 0; + return; + } + + buf->size -= len; + memmove(buf->data, buf->data + len, buf->size); +} + +/* buftrucate: truncates the buffer at `size` */ +int +buftruncate(struct buf *buf, size_t size) +{ + if (buf->size < size || size < 0) { + /* bail out in debug mode so we can figure out why this happened */ + assert(0); + return BUF_EINVALIDIDX; + } + + buf->size = size; + return BUF_OK; +} diff --git a/SnudownTest/src/buffer.h b/SnudownTest/src/buffer.h new file mode 100644 index 0000000..ab98ab6 --- /dev/null +++ b/SnudownTest/src/buffer.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2008, Natacha Porté + * Copyright (c) 2011, Vicent Martí + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#ifndef BUFFER_H__ +#define BUFFER_H__ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_MSC_VER) +#define __attribute__(x) +#define inline +#endif + +typedef enum { + BUF_OK = 0, + BUF_ENOMEM = -1, + BUF_EINVALIDIDX = -2, +} buferror_t; + +/* struct buf: character array buffer */ +struct buf { + uint8_t *data; /* actual character data */ + size_t size; /* size of the string */ + size_t asize; /* allocated size (0 = volatile buffer) */ + size_t unit; /* reallocation unit size (0 = read-only buffer) */ +}; + +/* CONST_BUF: global buffer from a string litteral */ +#define BUF_STATIC(string) \ + { (uint8_t *)string, sizeof string -1, sizeof string, 0, 0 } + +/* VOLATILE_BUF: macro for creating a volatile buffer on the stack */ +#define BUF_VOLATILE(strname) \ + { (uint8_t *)strname, strlen(strname), 0, 0, 0 } + +/* BUFPUTSL: optimized bufputs of a string litteral */ +#define BUFPUTSL(output, literal) \ + bufput(output, literal, sizeof literal - 1) + +/* bufgrow: increasing the allocated size to the given value */ +int bufgrow(struct buf *, size_t); + +/* bufnew: allocation of a new buffer */ +struct buf *bufnew(size_t) __attribute__ ((malloc)); + +/* bufnullterm: NUL-termination of the string array (making a C-string) */ +const char *bufcstr(struct buf *); + +/* bufprefix: compare the beginning of a buffer with a string */ +int bufprefix(const struct buf *buf, const char *prefix); + +/* bufput: appends raw data to a buffer */ +void bufput(struct buf *, const void *, size_t); + +/* bufputs: appends a NUL-terminated string to a buffer */ +void bufputs(struct buf *, const char *); + +/* bufputc: appends a single char to a buffer */ +void bufputc(struct buf *, int); + +/* bufrelease: decrease the reference count and free the buffer if needed */ +void bufrelease(struct buf *); + +/* bufreset: frees internal data of the buffer */ +void bufreset(struct buf *); + +/* bufslurp: removes a given number of bytes from the head of 
the array */ +void bufslurp(struct buf *, size_t); + +/* bufprintf: formatted printing to a buffer */ +void bufprintf(struct buf *, const char *, ...) __attribute__ ((format (printf, 2, 3))); + +/* buftruncate: truncates the buffer at `size` */ +int buftruncate(struct buf *buf, size_t size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/SnudownTest/src/html_blocks.h b/SnudownTest/src/html_blocks.h new file mode 100644 index 0000000..09a758f --- /dev/null +++ b/SnudownTest/src/html_blocks.h @@ -0,0 +1,206 @@ +/* C code produced by gperf version 3.0.3 */ +/* Command-line: gperf -N find_block_tag -H hash_block_tag -C -c -E --ignore-case html_block_names.txt */ +/* Computed positions: -k'1-2' */ + +#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ + && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \ + && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \ + && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \ + && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \ + && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \ + && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \ + && ('=' == 61) && ('>' == 62) && ('?' 
== 63) && ('A' == 65) \ + && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \ + && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \ + && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \ + && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \ + && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \ + && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \ + && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \ + && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \ + && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \ + && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \ + && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \ + && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \ + && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \ + && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \ + && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)) +/* The character set is not based on ISO-646. */ +error "gperf generated tables don't work with this execution character set. Please report a bug to ." 
+#endif + +/* maximum key range = 37, duplicates = 0 */ + +#ifndef GPERF_DOWNCASE +#define GPERF_DOWNCASE 1 +static unsigned char gperf_downcase[256] = + { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, + 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255 + }; +#endif + +#ifndef GPERF_CASE_STRNCMP +#define GPERF_CASE_STRNCMP 1 +static int +gperf_case_strncmp (s1, s2, n) + register const char *s1; + register const char *s2; + register unsigned int n; +{ + for (; n > 0;) + { + unsigned char c1 = gperf_downcase[(unsigned char)*s1++]; + unsigned char c2 = gperf_downcase[(unsigned char)*s2++]; + if (c1 != 0 && c1 == c2) + { + n--; + continue; + } + return (int)c1 - (int)c2; + } + return 0; +} +#endif + +#ifdef __GNUC__ +__inline +#else +#ifdef __cplusplus +inline +#endif +#endif +static unsigned int +hash_block_tag (str, len) + register const char *str; + register 
unsigned int len; +{ + static const unsigned char asso_values[] = + { + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 8, 30, 25, 20, 15, 10, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 0, 38, 0, 38, + 5, 5, 5, 15, 0, 38, 38, 0, 15, 10, + 0, 38, 38, 15, 0, 5, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 0, 38, + 0, 38, 5, 5, 5, 15, 0, 38, 38, 0, + 15, 10, 0, 38, 38, 15, 0, 5, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38 + }; + register int hval = len; + + switch (hval) + { + default: + hval += asso_values[(unsigned char)str[1]+1]; + /*FALLTHROUGH*/ + case 1: + hval += asso_values[(unsigned char)str[0]]; + break; + } + return hval; +} + +#ifdef __GNUC__ +__inline +#ifdef __GNUC_STDC_INLINE__ +__attribute__ ((__gnu_inline__)) +#endif +#endif +const char * +find_block_tag (str, len) + register const char *str; + register unsigned int len; +{ + enum + { + TOTAL_KEYWORDS = 24, + MIN_WORD_LENGTH = 1, + MAX_WORD_LENGTH = 10, + MIN_HASH_VALUE = 1, + MAX_HASH_VALUE = 37 + }; + + static const char * const wordlist[] = + { + "", + "p", + "dl", + "div", + "math", + "table", + "", + "ul", + "del", + "form", + "blockquote", + "figure", + "ol", + "fieldset", + "", + "h1", + "", + "h6", + "pre", + "", "", + "script", + "h5", + "noscript", + "", + "style", + "iframe", + "h4", + "ins", + "", 
"", "", + "h3", + "", "", "", "", + "h2" + }; + + if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) + { + register int key = hash_block_tag (str, len); + + if (key <= MAX_HASH_VALUE && key >= 0) + { + register const char *s = wordlist[key]; + + if ((((unsigned char)*str ^ (unsigned char)*s) & ~32) == 0 && !gperf_case_strncmp (str, s, len) && s[len] == '\0') + return s; + } + } + return 0; +} diff --git a/SnudownTest/src/html_entities.gperf b/SnudownTest/src/html_entities.gperf new file mode 100644 index 0000000..543103d --- /dev/null +++ b/SnudownTest/src/html_entities.gperf @@ -0,0 +1,292 @@ +%language=ANSI-C +%define lookup-function-name is_allowed_named_entity +%compare-strncmp +%readonly-tables +%define hash-function-name hash_html_entity +%enum +%includes +%{ +#include + +/* Parsers tend to choke on entities with values greater than this */ +const u_int32_t max_num_entity_val = 0x10ffff; +/* Any numeric entity longer than this is obviously above max_num_entity_val + * used to avoid dealing with overflows. */ +const size_t MAX_NUM_ENTITY_LEN = 7; + +inline int is_valid_numeric_entity(uint32_t entity_val) +{ + /* Some XML parsers will choke on entities with certain + * values (mostly control characters.) 
+ * + * According to lxml these are all problematic: + * + * [xrange(0, 8), + * xrange(11, 12), + * xrange(14, 31), + * xrange(55296, 57343), + * xrange(65534, 65535)] + */ + return (entity_val > 8 + && (entity_val != 11 && entity_val != 12) + && (entity_val < 14 || entity_val > 31) + && (entity_val < 55296 || entity_val > 57343) + && (entity_val != 65534 && entity_val != 65535) + && entity_val <= max_num_entity_val); +} + +%} +%% +Æ +Á + +À +Α +Å +à +Ä +Β +Ç +Χ +‡ +Δ +Ð +É +Ê +È +Ε +Η +Ë +Γ +Í +Î +Ì +Ι +Ï +Κ +Λ +Μ +Ñ +Ν +Œ +Ó +Ô +Ò +Ω +Ο +Ø +Õ +Ö +Φ +Π +″ +Ψ +Ρ +Š +Σ +Þ +Τ +Θ +Ú +Û +Ù +Υ +Ü +Ξ +Ý +Ÿ +Ζ +á +â +´ +æ +à +ℵ +α +& +∧ +∠ +' +å +≈ +ã +ä +„ +β +¦ +• +∩ +ç +¸ +¢ +χ +ˆ +♣ +≅ +© +↵ +∪ +¤ +⇓ +† +↓ +° +δ +♦ +÷ +é +ê +è +∅ +  +  +ε +≡ +η +ð +ë +€ +∃ +ƒ +∀ +½ +¼ +¾ +⁄ +γ +≥ +> +⇔ +↔ +♥ +… +í +î +¡ +ì +ℑ +∞ +∫ +ι +¿ +∈ +ï +κ +⇐ +λ +⟨ +« +← +⌈ +“ +≤ +⌊ +∗ +◊ +‎ +‹ +‘ +< +¯ +— +µ +· +− +μ +∇ +  +– +≠ +∋ +¬ +∉ +⊄ +ñ +ν +ó +ô +œ +ò +‾ +ω +ο +⊕ +∨ +ª +º +ø +õ +⊗ +ö +¶ +∂ +‰ +⊥ +φ +π +ϖ +± +£ +′ +∏ +∝ +ψ +" +⇒ +√ +⟩ +» +→ +⌉ +” +ℜ +® +⌋ +ρ +‏ +› +’ +‚ +š +⋅ +§ +­ +σ +ς +∼ +♠ +⊂ +⊆ +∑ +¹ +² +³ +⊃ +⊇ +ß +τ +∴ +θ +ϑ +  +þ +˜ +× +™ +⇑ +ú +↑ +û +ù +¨ +ϒ +υ +ü +℘ +ξ +ý +¥ +ÿ +ζ +‍ +‌ diff --git a/SnudownTest/src/markdown.c b/SnudownTest/src/markdown.c new file mode 100644 index 0000000..abe4a1d --- /dev/null +++ b/SnudownTest/src/markdown.c @@ -0,0 +1,2661 @@ +/* markdown.c - generic markdown parser */ + +/* + * Copyright (c) 2009, Natacha Porté + * Copyright (c) 2011, Vicent Marti + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "markdown.h" +#include "stack.h" + +#include +#include +#include +#include + +#if defined(_WIN32) +#define strncasecmp _strnicmp +#endif + +#define REF_TABLE_SIZE 8 + +#define BUFFER_BLOCK 0 +#define BUFFER_SPAN 1 + +#define MKD_LI_END 8 /* internal list flag */ + +#define gperf_case_strncmp(s1, s2, n) strncasecmp(s1, s2, n) +#define GPERF_DOWNCASE 1 +#define GPERF_CASE_STRNCMP 1 +#include "html_blocks.h" +#include "html_entities.h" + +/*************** + * LOCAL TYPES * + ***************/ + +/* link_ref: reference to a link */ +struct link_ref { + unsigned int id; + + struct buf *link; + struct buf *title; + + struct link_ref *next; +}; + +/* char_trigger: function pointer to render active chars */ +/* returns the number of chars taken care of */ +/* data is the pointer of the beginning of the span */ +/* offset is the number of valid chars before data */ +struct sd_markdown; +typedef size_t +(*char_trigger)(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); + +static size_t char_emphasis(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_linebreak(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_codespan(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_escape(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_entity(struct buf *ob, 
struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_langle_tag(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_autolink_url(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_autolink_email(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_autolink_www(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_autolink_subreddit_or_username(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_link(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); +static size_t char_superscript(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size); + +enum markdown_char_t { + MD_CHAR_NONE = 0, + MD_CHAR_EMPHASIS, + MD_CHAR_CODESPAN, + MD_CHAR_LINEBREAK, + MD_CHAR_LINK, + MD_CHAR_LANGLE, + MD_CHAR_ESCAPE, + MD_CHAR_ENTITITY, + MD_CHAR_AUTOLINK_URL, + MD_CHAR_AUTOLINK_EMAIL, + MD_CHAR_AUTOLINK_WWW, + MD_CHAR_AUTOLINK_SUBREDDIT_OR_USERNAME, + MD_CHAR_SUPERSCRIPT, +}; + +static char_trigger markdown_char_ptrs[] = { /* entries follow the order of enum markdown_char_t above; parse_inline indexes this table with the active_char value */ + NULL, + &char_emphasis, + &char_codespan, + &char_linebreak, + &char_link, + &char_langle_tag, + &char_escape, + &char_entity, + &char_autolink_url, + &char_autolink_email, + &char_autolink_www, + &char_autolink_subreddit_or_username, + &char_superscript, +}; + +/* render • structure containing one particular render */ +struct sd_markdown { + struct sd_callbacks cb; + void *opaque; + + struct link_ref *refs[REF_TABLE_SIZE]; + uint8_t active_char[256]; + struct stack work_bufs[2]; + unsigned int 
ext_flags; + size_t max_nesting; + size_t max_table_cols; + int in_link_body; +}; + +/*************************** + * HELPER FUNCTIONS * + ***************************/ +/* rndr_newbuf • takes a work buffer of the given type from the pool, reusing a pooled buffer when one exists and allocating otherwise */ +static inline struct buf * +rndr_newbuf(struct sd_markdown *rndr, int type) +{ + static const size_t buf_size[2] = {256, 64}; + struct buf *work = NULL; + struct stack *pool = &rndr->work_bufs[type]; + + if (pool->size < pool->asize && + pool->item[pool->size] != NULL) { + work = pool->item[pool->size++]; + work->size = 0; + } else { + work = bufnew(buf_size[type]); + stack_push(pool, work); + } + + return work; +} +/* rndr_popbuf • gives the most recent work buffer back to the pool; only the pool size shrinks, the buffer stays allocated for reuse */ +static inline void +rndr_popbuf(struct sd_markdown *rndr, int type) +{ + rndr->work_bufs[type].size--; +} +/* unscape_text • copies src into ob, dropping each backslash and keeping the char that follows it; a lone trailing backslash is dropped */ +static void +unscape_text(struct buf *ob, struct buf *src) +{ + size_t i = 0, org; + while (i < src->size) { + org = i; + while (i < src->size && src->data[i] != '\\') + i++; + + if (i > org) + bufput(ob, src->data + org, i - org); + + if (i + 1 >= src->size) + break; + + bufputc(ob, src->data[i + 1]); + i += 2; + } +} +/* hash_link_ref • case-insensitive hash of a reference name */ +static unsigned int +hash_link_ref(const uint8_t *link_ref, size_t length) +{ + size_t i; + unsigned int hash = 0; + + for (i = 0; i < length; ++i) + hash = tolower(link_ref[i]) + (hash << 6) + (hash << 16) - hash; + + return hash; +} +/* add_link_ref • allocates a zeroed link_ref and pushes it at the head of its hash bucket; returns NULL when the allocation fails */ +static struct link_ref * +add_link_ref( + struct link_ref **references, + const uint8_t *name, size_t name_size) +{ + struct link_ref *ref = calloc(1, sizeof(struct link_ref)); + + if (!ref) + return NULL; + + ref->id = hash_link_ref(name, name_size); + ref->next = references[ref->id % REF_TABLE_SIZE]; + + references[ref->id % REF_TABLE_SIZE] = ref; + return ref; +} +/* find_link_ref • looks a reference up by name; only the hash is compared, the name itself is not stored or checked */ +static struct link_ref * +find_link_ref(struct link_ref **references, uint8_t *name, size_t length) +{ + unsigned int hash = hash_link_ref(name, length); + struct link_ref *ref = NULL; + + ref = references[hash % REF_TABLE_SIZE]; + + while (ref != NULL) { + if (ref->id == hash) + return ref; + + ref = ref->next; + } + + return NULL; +} + +static void 
+free_link_refs(struct link_ref **references) +{ + size_t i; + + for (i = 0; i < REF_TABLE_SIZE; ++i) { + struct link_ref *r = references[i]; + struct link_ref *next; + + while (r) { + next = r->next; + bufrelease(r->link); + bufrelease(r->title); + free(r); + r = next; + } + } +} + +/* + * Check whether a char is a Markdown space. + + * Right now we only consider spaces the actual + * space and a newline: tabs and carriage returns + * are filtered out during the preprocessing phase. + * + * If we wanted to actually be UTF-8 compliant, we + * should instead extract an Unicode codepoint from + * this character and check for space properties. + */ +static inline int +_isspace(int c) +{ + return c == ' ' || c == '\n'; +} + +/**************************** + * INLINE PARSING FUNCTIONS * + ****************************/ + +/* is_mail_autolink • looks for the address part of a mail autolink and '>' */ +/* this is less strict than the original markdown e-mail address matching */ +static size_t +is_mail_autolink(uint8_t *data, size_t size) +{ + size_t i = 0, nb = 0; + + /* address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@' */ + for (i = 0; i < size; ++i) { + if (isalnum(data[i])) + continue; + + switch (data[i]) { + case '@': + nb++; +/* fall through: '@' is an accepted address char, it is only counted */ + case '-': + case '.': + case '_': + break; + + case '>': + return (nb == 1) ? i + 1 : 0; + + default: + return 0; + } + } + + return 0; +} + +/* tag_length • returns the length of the given tag, or 0 if it's not valid */ +static size_t +tag_length(uint8_t *data, size_t size, enum mkd_autolink *autolink) +{ + size_t i, j; + + /* a valid tag can't be shorter than 3 chars */ + if (size < 3) return 0; + + /* begins with a '<' optionally followed by '/', followed by letter or number */ + if (data[0] != '<') return 0; + i = (data[1] == '/') ? 
2 : 1; + + if (!isalnum(data[i])) + return 0; + + /* scheme test */ + *autolink = MKDA_NOT_AUTOLINK; + + /* try to find the beginning of an URI */ + while (i < size && (isalnum(data[i]) || data[i] == '.' || data[i] == '+' || data[i] == '-')) + i++; + + if (i > 1 && data[i] == '@') { + if ((j = is_mail_autolink(data + i, size - i)) != 0) { + *autolink = MKDA_EMAIL; + return i + j; + } + } + + if (i > 2 && data[i] == ':') { + *autolink = MKDA_NORMAL; + i++; + } + + /* completing autolink test: no whitespace or ' or " */ + if (i >= size) + *autolink = MKDA_NOT_AUTOLINK; + + else if (*autolink) { + j = i; + + while (i < size) { + if (data[i] == '\\') i += 2; + else if (data[i] == '>' || data[i] == '\'' || + data[i] == '"' || data[i] == ' ' || data[i] == '\n') + break; + else i++; + } + + if (i >= size) return 0; + if (i > j && data[i] == '>') return i + 1; + /* one of the forbidden chars has been found */ + *autolink = MKDA_NOT_AUTOLINK; + } + + /* looking for something looking like a tag end */ + while (i < size && data[i] != '>') i++; + if (i >= size) return 0; + return i + 1; +} + +/* parse_inline • parses inline markdown elements */ +static void +parse_inline(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size) +{ + size_t i = 0, end = 0, last_special = 0; + uint8_t action = 0; + struct buf work = { 0, 0, 0, 0 }; + + if (rndr->work_bufs[BUFFER_SPAN].size + + rndr->work_bufs[BUFFER_BLOCK].size > rndr->max_nesting) + return; + + while (i < size) { + /* copying inactive chars into the output */ + while (end < size && (action = rndr->active_char[data[end]]) == 0) { + end++; + } + + if (rndr->cb.normal_text) { + work.data = data + i; + work.size = end - i; + rndr->cb.normal_text(ob, &work, rndr->opaque); + } + else + bufput(ob, data + i, end - i); + + if (end >= size) break; + i = end; + + end = markdown_char_ptrs[(int)action](ob, rndr, data + i, i - last_special, i, size - i); + if (!end) /* no action from the callback */ + end = i + 1; + else { + i += 
end; + last_special = end = i; + } + } +} + +/* find_emph_char • looks for the next emphasis char, skipping other constructs */ +static size_t +find_emph_char(uint8_t *data, size_t size, uint8_t c) +{ + size_t i = 1; + + while (i < size) { + while (i < size && data[i] != c && data[i] != '`' && data[i] != '[') + i++; + + if (i == size) + return 0; + + if (data[i] == c) + return i; + + /* not counting escaped chars */ + if (i && data[i - 1] == '\\') { + i++; continue; + } + + if (data[i] == '`') { + size_t span_nb = 0, bt; + size_t tmp_i = 0; + + /* counting the number of opening backticks */ + while (i < size && data[i] == '`') { + i++; span_nb++; + } + + if (i >= size) return 0; + + /* finding the matching closing sequence */ + bt = 0; + while (i < size && bt < span_nb) { + if (!tmp_i && data[i] == c) tmp_i = i; + if (data[i] == '`') bt++; + else bt = 0; + i++; + } + + if (i >= size) return tmp_i; + } + /* skipping a link */ + else if (data[i] == '[') { + size_t tmp_i = 0; + uint8_t cc; + + i++; + while (i < size && data[i] != ']') { + if (!tmp_i && data[i] == c) tmp_i = i; + i++; + } + + i++; + while (i < size && (data[i] == ' ' || data[i] == '\n')) + i++; + + if (i >= size) + return tmp_i; + + switch (data[i]) { + case '[': + cc = ']'; break; + + case '(': + cc = ')'; break; + + default: + if (tmp_i) + return tmp_i; + else + continue; + } + + i++; + while (i < size && data[i] != cc) { + if (!tmp_i && data[i] == c) tmp_i = i; + i++; + } + + if (i >= size) + return tmp_i; + + i++; + } + } + + return 0; +} + +/* parse_emph1 • parsing single emphasis */ +/* closed by a symbol not preceded by whitespace and not followed by symbol */ +static size_t +parse_emph1(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size, uint8_t c) +{ + size_t i = 0, len; + struct buf *work = 0; + int r; + + if (!rndr->cb.emphasis) return 0; + + /* skipping one symbol if coming from emph3 */ + if (size > 1 && data[0] == c && data[1] == c) i = 1; + + while (i < size) { + len = 
find_emph_char(data + i, size - i, c); + if (!len) return 0; + i += len; + if (i >= size) return 0; + + if (data[i] == c && !_isspace(data[i - 1])) { + if ((rndr->ext_flags & MKDEXT_NO_INTRA_EMPHASIS) && (c == '_')) { + if (!(i + 1 == size || _isspace(data[i + 1]) || ispunct(data[i + 1]))) + continue; + } + + work = rndr_newbuf(rndr, BUFFER_SPAN); + parse_inline(work, rndr, data, i); + r = rndr->cb.emphasis(ob, work, rndr->opaque); + rndr_popbuf(rndr, BUFFER_SPAN); + return r ? i + 1 : 0; + } + } + + return 0; +} + +/* parse_emph2 • parsing double emphasis, or strikethrough when c is '~' */ +static size_t +parse_emph2(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size, uint8_t c) +{ + int (*render_method)(struct buf *ob, const struct buf *text, void *opaque); + size_t i = 0, len; + struct buf *work = 0; + int r; + + render_method = (c == '~') ? rndr->cb.strikethrough : rndr->cb.double_emphasis; + + if (!render_method) + return 0; + + while (i < size) { + len = find_emph_char(data + i, size - i, c); + if (!len) return 0; + i += len; + + if (i + 1 < size && data[i] == c && data[i + 1] == c && i && !_isspace(data[i - 1])) { + work = rndr_newbuf(rndr, BUFFER_SPAN); + parse_inline(work, rndr, data, i); + r = render_method(ob, work, rndr->opaque); + rndr_popbuf(rndr, BUFFER_SPAN); + return r ? 
i + 2 : 0; + } + i++; + } + return 0; +} + +/* parse_emph3 • parsing triple emphasis */ +/* finds the first closing tag, and delegates to the other emph */ +static size_t +parse_emph3(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size, uint8_t c) +{ + size_t i = 0, len; + int r; + + while (i < size) { + len = find_emph_char(data + i, size - i, c); + if (!len) return 0; + i += len; + + /* skip whitespace preceded symbols */ + if (data[i] != c || _isspace(data[i - 1])) + continue; + + if (i + 2 < size && data[i + 1] == c && data[i + 2] == c && rndr->cb.triple_emphasis) { + /* triple symbol found */ + struct buf *work = rndr_newbuf(rndr, BUFFER_SPAN); + + parse_inline(work, rndr, data, i); + r = rndr->cb.triple_emphasis(ob, work, rndr->opaque); + rndr_popbuf(rndr, BUFFER_SPAN); + return r ? i + 3 : 0; + + } else if (i + 1 < size && data[i + 1] == c) { + /* double symbol found, handing over to emph1 */ + len = parse_emph1(ob, rndr, data - 2, size + 2, c); + if (!len) return 0; + else return len - 2; + + } else { + /* single symbol found, handing over to emph2 */ + len = parse_emph2(ob, rndr, data - 1, size + 1, c); + if (!len) return 0; + else return len - 1; + } + } + return 0; +} + +/* char_emphasis • single and double emphasis parsing */ +static size_t +char_emphasis(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + uint8_t c = data[0]; + size_t ret; + + if (size > 2 && data[1] != c) { + /* whitespace cannot follow an opening emphasis; + * strikethrough only takes two characters '~~' */ + if (c == '~' || _isspace(data[1]) || (ret = parse_emph1(ob, rndr, data + 1, size - 1, c)) == 0) + return 0; + + return ret + 1; + } + + if (size > 3 && data[1] == c && data[2] != c) { + if (_isspace(data[2]) || (ret = parse_emph2(ob, rndr, data + 2, size - 2, c)) == 0) + return 0; + + return ret + 2; + } + + if (size > 4 && data[1] == c && data[2] == c && data[3] != c) { + if (c == '~' || 
_isspace(data[3]) || (ret = parse_emph3(ob, rndr, data + 3, size - 3, c)) == 0) + return 0; + + return ret + 3; + } + + return 0; +} + + +/* char_linebreak • '\n' preceded by two spaces (assuming linebreak != 0) */ +static size_t +char_linebreak(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + if (max_rewind < 2 || data[-1] != ' ' || data[-2] != ' ') + return 0; + + /* removing all trailing spaces from ob and rendering */ + while (ob->size && ob->data[ob->size - 1] == ' ') + ob->size--; + + return rndr->cb.linebreak(ob, rndr->opaque) ? 1 : 0; +} + + +/* char_codespan • '`' parsing a code span (assuming codespan != 0) */ +static size_t +char_codespan(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + size_t end, nb = 0, i, f_begin, f_end; + + /* counting the number of backticks in the delimiter */ + while (nb < size && data[nb] == '`') + nb++; + + /* finding the next delimiter */ + i = 0; + for (end = nb; end < size && i < nb; end++) { + if (data[end] == '`') i++; + else i = 0; + } + + if (i < nb && end >= size) + return 0; /* no matching delimiter */ + + /* trimming outside whitespaces */ + f_begin = nb; + while (f_begin < end && data[f_begin] == ' ') + f_begin++; + + f_end = end - nb; + while (f_end > nb && data[f_end-1] == ' ') + f_end--; + + /* real code span */ + if (f_begin < f_end) { + struct buf work = { data + f_begin, f_end - f_begin, 0, 0 }; + if (!rndr->cb.codespan(ob, &work, rndr->opaque)) + end = 0; + } else { + if (!rndr->cb.codespan(ob, 0, rndr->opaque)) + end = 0; + } + + return end; +} + + +/* char_escape • '\\' backslash escape */ +static size_t +char_escape(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + static const char *escape_chars = "\\`*_{}[]()#+-.!:|&<>/^~"; + struct buf work = { 0, 0, 0, 0 }; + + if (size > 1) { + if (strchr(escape_chars, 
data[1]) == NULL) + return 0; + + if (rndr->cb.normal_text) { + work.data = data + 1; + work.size = 1; + rndr->cb.normal_text(ob, &work, rndr->opaque); + } + else bufputc(ob, data[1]); + } else if (size == 1) { + bufputc(ob, data[0]); + } + + return 2; +} + +/* char_entity • '&' escaped when it doesn't belong to an entity */ +static size_t +char_entity(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + size_t end = 1; + size_t content_start; + size_t content_end; + struct buf work = { 0, 0, 0, 0 }; + int numeric = 0; + int hex = 0; + int entity_base; + uint32_t entity_val; + + if (end < size && data[end] == '#') { + numeric = 1; + end++; + } + + if (end < size && numeric && tolower(data[end]) == 'x') { + hex = 1; + end++; + } + + content_start = end; + + while (end < size) { + const char c = data[end]; + if (hex) { + if (!isxdigit(c)) break; + } else if (numeric) { + if (!isdigit(c)) break; + } else if (!isalnum(c)) { + break; + } + end++; + } + + content_end = end; + + if (end > content_start && end < size && data[end] == ';') + end++; /* well-formed entity */ + else + return 0; /* not an entity */ + + /* way too long to be a valid numeric entity */ + if (numeric && content_end - content_start > MAX_NUM_ENTITY_LEN) + return 0; + + /* Validate the entity's contents */ + if (numeric) { + if (hex) + entity_base = 16; + else + entity_base = 10; + + // This is ok because it'll stop once it hits the ';' + entity_val = strtol((char*)data + content_start, NULL, entity_base); + if (!is_valid_numeric_entity(entity_val)) + return 0; + } else { + if (!is_allowed_named_entity((const char *)data, end)) + return 0; + } + + if (rndr->cb.entity) { + work.data = data; + work.size = end; + rndr->cb.entity(ob, &work, rndr->opaque); + } else { + /* Necessary so we can normalize `&#X3E;` to `&#x3E;` (the hex marker is always re-emitted as a lowercase 'x') */ + bufputc(ob, '&'); + if (numeric) + bufputc(ob, '#'); + if (hex) + bufputc(ob, 'x'); + bufput(ob, data + content_start, end - 
content_start); + } + + return end; +} + +/* char_langle_tag • '<' when tags or autolinks are allowed */ +static size_t +char_langle_tag(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + enum mkd_autolink altype = MKDA_NOT_AUTOLINK; + size_t end = tag_length(data, size, &altype); + struct buf work = { data, end, 0, 0 }; + int ret = 0; + + if (end > 2) { + if (rndr->cb.autolink && altype != MKDA_NOT_AUTOLINK) { + struct buf *u_link = rndr_newbuf(rndr, BUFFER_SPAN); + work.data = data + 1; + work.size = end - 2; + unscape_text(u_link, &work); + ret = rndr->cb.autolink(ob, u_link, altype, rndr->opaque); + rndr_popbuf(rndr, BUFFER_SPAN); + } + else if (rndr->cb.raw_html_tag) + ret = rndr->cb.raw_html_tag(ob, &work, rndr->opaque); + } + + if (!ret) return 0; + else return end; +} +/* char_autolink_www • renders whatever sd_autolink__www() matches as a link: the matched text is the link body and "http://" is prefixed to form the target; disabled inside a link body */ +static size_t +char_autolink_www(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + struct buf *link, *link_url, *link_text; + size_t link_len, rewind; + + if (!rndr->cb.link || rndr->in_link_body) + return 0; + + link = rndr_newbuf(rndr, BUFFER_SPAN); + + if ((link_len = sd_autolink__www(&rewind, link, data, max_rewind, size, 0)) > 0) { + link_url = rndr_newbuf(rndr, BUFFER_SPAN); + BUFPUTSL(link_url, "http://"); + bufput(link_url, link->data, link->size); + + buftruncate(ob, ob->size - rewind); + if (rndr->cb.normal_text) { + link_text = rndr_newbuf(rndr, BUFFER_SPAN); + rndr->cb.normal_text(link_text, link, rndr->opaque); + rndr->cb.link(ob, link_url, NULL, link_text, rndr->opaque); + rndr_popbuf(rndr, BUFFER_SPAN); + } else { + rndr->cb.link(ob, link_url, NULL, link, rndr->opaque); + } + rndr_popbuf(rndr, BUFFER_SPAN); + } + + rndr_popbuf(rndr, BUFFER_SPAN); + return link_len; +} + +static size_t +char_autolink_subreddit_or_username(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + struct 
buf *link, *link_text, *link_url; + size_t link_len, rewind; + int no_slash; + + if (!rndr->cb.autolink || rndr->in_link_body) + return 0; +/* NOTE(review): the guard above tests cb.autolink, but the code below calls cb.link - confirm cb.link is always set when cb.autolink is */ + link = rndr_newbuf(rndr, BUFFER_SPAN); + + link_len = sd_autolink__subreddit(&rewind, link, data, max_rewind, max_lookbehind, size, &no_slash); + if (link_len == 0) + link_len = sd_autolink__username(&rewind, link, data, max_rewind, max_lookbehind, size, &no_slash); + + /* Found either a user or subreddit link */ + if (link_len > 0) { + link_url = rndr_newbuf(rndr, BUFFER_SPAN); + if (no_slash) + bufputc(link_url, '/'); + bufput(link_url, link->data, link->size); + + buftruncate(ob, ob->size - rewind); + if (rndr->cb.normal_text) { + link_text = rndr_newbuf(rndr, BUFFER_SPAN); + rndr->cb.normal_text(link_text, link, rndr->opaque); + rndr->cb.link(ob, link_url, NULL, link_text, rndr->opaque); + rndr_popbuf(rndr, BUFFER_SPAN); + } else { + rndr->cb.link(ob, link_url, NULL, link, rndr->opaque); + } + rndr_popbuf(rndr, BUFFER_SPAN); + } + rndr_popbuf(rndr, BUFFER_SPAN); + + return link_len; +} + +static size_t +char_autolink_email(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + struct buf *link; + size_t link_len, rewind; + + if (!rndr->cb.autolink || rndr->in_link_body) + return 0; + + link = rndr_newbuf(rndr, BUFFER_SPAN); + + if ((link_len = sd_autolink__email(&rewind, link, data, max_rewind, size, 0)) > 0) { + buftruncate(ob, ob->size - rewind); + rndr->cb.autolink(ob, link, MKDA_EMAIL, rndr->opaque); + } + + rndr_popbuf(rndr, BUFFER_SPAN); + return link_len; +} + +static size_t +char_autolink_url(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + struct buf *link; + size_t link_len, rewind; + + if (!rndr->cb.autolink || rndr->in_link_body) + return 0; + + link = rndr_newbuf(rndr, BUFFER_SPAN); + + if ((link_len = sd_autolink__url(&rewind, link, data, max_rewind, size, 0)) > 0) { + 
buftruncate(ob, ob->size - rewind); + rndr->cb.autolink(ob, link, MKDA_NORMAL, rndr->opaque); + } + + rndr_popbuf(rndr, BUFFER_SPAN); + return link_len; +} + +/* char_link • '[': parsing a link or an image */ +static size_t +char_link(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + int is_img = (max_rewind && data[-1] == '!'), level; + size_t i = 1, txt_e, link_b = 0, link_e = 0, title_b = 0, title_e = 0; + struct buf *content = 0; + struct buf *link = 0; + struct buf *title = 0; + struct buf *u_link = 0; + size_t org_work_size = rndr->work_bufs[BUFFER_SPAN].size; + int text_has_nl = 0, ret = 0; + int in_title = 0, qtype = 0; + + /* checking whether the correct renderer exists */ + if ((is_img && !rndr->cb.image) || (!is_img && !rndr->cb.link)) + goto cleanup; + + /* looking for the matching closing bracket */ + for (level = 1; i < size; i++) { + if (data[i] == '\n') + text_has_nl = 1; + + else if (data[i - 1] == '\\') + continue; + + else if (data[i] == '[') + level++; + + else if (data[i] == ']') { + level--; + if (level <= 0) + break; + } + } + + if (i >= size) + goto cleanup; + + txt_e = i; + i++; + + /* skip any amount of whitespace or newline */ + /* (this is much laxer than the original markdown syntax) */ + while (i < size && _isspace(data[i])) + i++; + + /* inline style link */ + if (i < size && data[i] == '(') { + /* skipping initial whitespace */ + i++; + + while (i < size && _isspace(data[i])) + i++; + + link_b = i; + + /* looking for link end: ' " ) */ + while (i < size) { + if (data[i] == '\\') i += 2; + else if (data[i] == ')') break; + else if (i >= 1 && _isspace(data[i-1]) && (data[i] == '\'' || data[i] == '"')) break; + else i++; + } + + if (i >= size) goto cleanup; + link_e = i; + + /* looking for title end if present */ + if (data[i] == '\'' || data[i] == '"') { + qtype = data[i]; + in_title = 1; + i++; + title_b = i; + + while (i < size) { + if (data[i] == '\\') i += 2; + 
else if (data[i] == qtype) {in_title = 0; i++;} /* a matching quote closes the title */ + else if ((data[i] == ')') && !in_title) break; /* ')' only ends the link once the title is closed */ + else i++; + } + + if (i >= size) goto cleanup; + + /* skipping whitespace after title */ + title_e = i - 1; + while (title_e > title_b && _isspace(data[title_e])) + title_e--; + + /* checking for closing quote presence */ + if (data[title_e] != '\'' && data[title_e] != '"') { + title_b = title_e = 0; + link_e = i; + } + } + + /* remove whitespace at the end of the link */ + while (link_e > link_b && _isspace(data[link_e - 1])) + link_e--; + + /* remove optional angle brackets around the link */ + if (data[link_b] == '<') link_b++; + if (data[link_e - 1] == '>') link_e--; + + /* building escaped link and title */ + if (link_e > link_b) { + link = rndr_newbuf(rndr, BUFFER_SPAN); + bufput(link, data + link_b, link_e - link_b); + } + + if (title_e > title_b) { + title = rndr_newbuf(rndr, BUFFER_SPAN); + bufput(title, data + title_b, title_e - title_b); + } + + i++; + } + + /* reference style link */ + else if (i < size && data[i] == '[') { + struct buf id = { 0, 0, 0, 0 }; + struct link_ref *lr; + + /* looking for the id */ + i++; + link_b = i; + while (i < size && data[i] != ']') i++; + if (i >= size) goto cleanup; + link_e = i; + + /* finding the link_ref */ + if (link_b == link_e) { + if (text_has_nl) { + struct buf *b = rndr_newbuf(rndr, BUFFER_SPAN); + size_t j; + + for (j = 1; j < txt_e; j++) { + if (data[j] != '\n') + bufputc(b, data[j]); + else if (data[j - 1] != ' ') + bufputc(b, ' '); + } + + id.data = b->data; + id.size = b->size; + } else { + id.data = data + 1; + id.size = txt_e - 1; + } + } else { + id.data = data + link_b; + id.size = link_e - link_b; + } + + lr = find_link_ref(rndr->refs, id.data, id.size); + if (!lr) + goto cleanup; + + /* keeping link and title from link_ref */ + link = lr->link; + title = lr->title; + i++; + } + + /* shortcut reference style link */ + else { + struct buf id = { 0, 0, 0, 0 }; + struct link_ref *lr; + + /* crafting the 
id */ + if (text_has_nl) { + struct buf *b = rndr_newbuf(rndr, BUFFER_SPAN); + size_t j; + + for (j = 1; j < txt_e; j++) { + if (data[j] != '\n') + bufputc(b, data[j]); + else if (data[j - 1] != ' ') + bufputc(b, ' '); + } + + id.data = b->data; + id.size = b->size; + } else { + id.data = data + 1; + id.size = txt_e - 1; + } + + /* finding the link_ref */ + lr = find_link_ref(rndr->refs, id.data, id.size); + if (!lr) + goto cleanup; + + /* keeping link and title from link_ref */ + link = lr->link; + title = lr->title; + + /* rewinding the whitespace */ + i = txt_e + 1; + } + + /* building content: img alt is escaped, link content is parsed */ + if (txt_e > 1) { + content = rndr_newbuf(rndr, BUFFER_SPAN); + if (is_img) { + bufput(content, data + 1, txt_e - 1); + } else { + /* disable autolinking when parsing inline the + * content of a link */ + rndr->in_link_body = 1; + parse_inline(content, rndr, data + 1, txt_e - 1); + rndr->in_link_body = 0; + } + } + + if (link) { + u_link = rndr_newbuf(rndr, BUFFER_SPAN); + unscape_text(u_link, link); + } else { + goto cleanup; + } + + /* calling the relevant rendering function */ + if (is_img) { + if (ob->size && ob->data[ob->size - 1] == '!') + ob->size -= 1; + + ret = rndr->cb.image(ob, u_link, title, content, rndr->opaque); + } else { + ret = rndr->cb.link(ob, u_link, title, content, rndr->opaque); + } + + /* cleanup */ +cleanup: + rndr->work_bufs[BUFFER_SPAN].size = (int)org_work_size; /* restores the pool size saved on entry, giving back every span buffer taken in this function at once */ + return ret ? 
i : 0; +} +/* char_superscript • renders a superscript: either a span opened by '(' and ended by ')', or everything up to the next whitespace */ +static size_t +char_superscript(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t max_rewind, size_t max_lookbehind, size_t size) +{ + size_t sup_start, sup_len; + struct buf *sup; + + if (!rndr->cb.superscript) + return 0; + + if (size < 2) + return 0; + + if (data[1] == '(') { + sup_start = sup_len = 2; + + while (sup_len < size && data[sup_len] != ')' && data[sup_len - 1] != '\\') + sup_len++; + + if (sup_len == size) + return 0; + } else { + sup_start = sup_len = 1; + + while (sup_len < size && !_isspace(data[sup_len])) + sup_len++; + } + + if (sup_len - sup_start == 0) + return (sup_start == 2) ? 3 : 0; + + sup = rndr_newbuf(rndr, BUFFER_SPAN); + parse_inline(sup, rndr, data + sup_start, sup_len - sup_start); + rndr->cb.superscript(ob, sup, rndr->opaque); + rndr_popbuf(rndr, BUFFER_SPAN); + + return (sup_start == 2) ? sup_len + 1 : sup_len; +} + +/********************************* + * BLOCK-LEVEL PARSING FUNCTIONS * + *********************************/ + +/* is_empty • returns the line length when it is empty, 0 otherwise */ +static size_t +is_empty(uint8_t *data, size_t size) +{ + size_t i; + + for (i = 0; i < size && data[i] != '\n'; i++) + if (data[i] != ' ') + return 0; + + return i + 1; +} + +/* is_hrule • returns whether a line is a horizontal rule */ +static int +is_hrule(uint8_t *data, size_t size) +{ + size_t i = 0, n = 0; + uint8_t c; + + /* skipping initial spaces */ + if (size < 3) return 0; + if (data[0] == ' ') { i++; + if (data[1] == ' ') { i++; + if (data[2] == ' ') { i++; } } } + + /* looking at the hrule char */ + if (i + 2 >= size + || (data[i] != '*' && data[i] != '-' && data[i] != '_')) + return 0; + c = data[i]; + + /* the whole line must be the char or whitespace */ + while (i < size && data[i] != '\n') { + if (data[i] == c) n++; + else if (data[i] != ' ') + return 0; + + i++; + } + + return n >= 3; +} + +/* check if a line begins with a code fence; return the + * width of the code fence */ +static size_t 
+prefix_codefence(uint8_t *data, size_t size) +{ + size_t i = 0, n = 0; + uint8_t c; + + /* skipping initial spaces */ + if (size < 3) return 0; + if (data[0] == ' ') { i++; + if (data[1] == ' ') { i++; + if (data[2] == ' ') { i++; } } } + + /* looking at the fence char: '~' or '`' */ + if (i + 2 >= size || !(data[i] == '~' || data[i] == '`')) + return 0; + + c = data[i]; + + /* counting the run of consecutive fence chars */ + while (i < size && data[i] == c) { + n++; i++; + } + + if (n < 3) + return 0; + + return i; +} + +/* check if a line is a code fence; return its size if it is */ +static size_t +is_codefence(uint8_t *data, size_t size, struct buf *syntax) +{ + size_t i = 0, syn_len = 0; + uint8_t *syn_start; + + i = prefix_codefence(data, size); + if (i == 0) + return 0; + + while (i < size && data[i] == ' ') + i++; + + syn_start = data + i; + + if (i < size && data[i] == '{') { + i++; syn_start++; + + while (i < size && data[i] != '}' && data[i] != '\n') { + syn_len++; i++; + } + + if (i == size || data[i] != '}') + return 0; + + /* strip all whitespace at the beginning and the end + * of the {} block */ + while (syn_len > 0 && _isspace(syn_start[0])) { + syn_start++; syn_len--; + } + + while (syn_len > 0 && _isspace(syn_start[syn_len - 1])) + syn_len--; + + i++; + } else { + while (i < size && !_isspace(data[i])) { + syn_len++; i++; + } + } + + if (syntax) { + syntax->data = syn_start; + syntax->size = syn_len; + } + + while (i < size && data[i] != '\n') { + if (!_isspace(data[i])) + return 0; + + i++; + } + + return i + 1; +} + +/* is_atxheader • returns whether the line is a hash-prefixed header */ +static int +is_atxheader(struct sd_markdown *rndr, uint8_t *data, size_t size) +{ + if (data[0] != '#') + return 0; + + if (rndr->ext_flags & MKDEXT_SPACE_HEADERS) { + size_t level = 0; + + while (level < size && level < 6 && data[level] == '#') + level++; + + if (level < size && data[level] != ' ') + return 0; + } + + return 1; +} + +/* is_headerline • returns 
whether the line is a setext-style hdr underline */ +static int +is_headerline(uint8_t *data, size_t size) +{ + size_t i = 0; + + /* test of level 1 header */ + if (data[i] == '=') { + for (i = 1; i < size && data[i] == '='; i++); + while (i < size && data[i] == ' ') i++; + return (i >= size || data[i] == '\n') ? 1 : 0; } + + /* test of level 2 header */ + if (data[i] == '-') { + for (i = 1; i < size && data[i] == '-'; i++); + while (i < size && data[i] == ' ') i++; + return (i >= size || data[i] == '\n') ? 2 : 0; } + + return 0; +} +/* is_next_headerline • returns the is_headerline() result for the line that follows the current one, 0 when there is none */ +static int +is_next_headerline(uint8_t *data, size_t size) +{ + size_t i = 0; + + while (i < size && data[i] != '\n') + i++; + + if (++i >= size) + return 0; + + return is_headerline(data + i, size - i); +} + +/* prefix_quote • returns blockquote prefix length */ +static size_t +prefix_quote(uint8_t *data, size_t size) +{ + size_t i = 0; + if (i < size && data[i] == ' ') i++; + if (i < size && data[i] == ' ') i++; + if (i < size && data[i] == ' ') i++; + + if (i < size && data[i] == '>') { + if (i + 1 < size && data[i + 1] == ' ') + return i + 2; + + return i + 1; + } + + return 0; +} + +/* prefix_code • returns prefix length for block code */ +static size_t +prefix_code(uint8_t *data, size_t size) +{ + if (size > 3 && data[0] == ' ' && data[1] == ' ' + && data[2] == ' ' && data[3] == ' ') return 4; + + return 0; +} + +/* prefix_oli • returns ordered list item prefix length */ +static size_t +prefix_oli(uint8_t *data, size_t size) +{ + size_t i = 0; + + if (i < size && data[i] == ' ') i++; + if (i < size && data[i] == ' ') i++; + if (i < size && data[i] == ' ') i++; + + if (i >= size || data[i] < '0' || data[i] > '9') + return 0; + + while (i < size && data[i] >= '0' && data[i] <= '9') + i++; + + if (i + 1 >= size || data[i] != '.' 
|| data[i + 1] != ' ') + return 0; + + if (is_next_headerline(data + i, size - i)) + return 0; + + return i + 2; +} + +/* prefix_uli • returns unordered list item prefix length, 0 if the line is not a bullet item */ +static size_t +prefix_uli(uint8_t *data, size_t size) +{ + size_t i = 0; + + if (i < size && data[i] == ' ') i++; + if (i < size && data[i] == ' ') i++; + if (i < size && data[i] == ' ') i++; + + if (i + 1 >= size || + (data[i] != '*' && data[i] != '+' && data[i] != '-') || + data[i + 1] != ' ') + return 0; + + if (is_next_headerline(data + i, size - i)) + return 0; + + return i + 2; +} + + +/* parse_block • forward declaration; renders the blocks found in data into ob (returns nothing) */ +static void parse_block(struct buf *ob, struct sd_markdown *rndr, + uint8_t *data, size_t size); + + +/* parse_blockquote • handles parsing of a blockquote fragment */ +static size_t +parse_blockquote(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size) +{ + size_t beg, end = 0, pre, work_size = 0; + uint8_t *work_data = 0; + struct buf *out = 0; + + out = rndr_newbuf(rndr, BUFFER_BLOCK); + beg = 0; + while (beg < size) { + for (end = beg + 1; end < size && data[end - 1] != '\n'; end++); + + pre = prefix_quote(data + beg, end - beg); + + if (pre) + beg += pre; /* skipping prefix */ + + /* empty line followed by non-quote line */ + else if (is_empty(data + beg, end - beg) && + (end >= size || (prefix_quote(data + end, size - end) == 0 && + !is_empty(data + end, size - end)))) + break; + + if (beg < end) { /* copy into the in-place working buffer */ + /* bufput(work, data + beg, end - beg); */ + if (!work_data) + work_data = data + beg; + else if (data + beg != work_data + work_size) + memmove(work_data + work_size, data + beg, end - beg); + work_size += end - beg; + } + beg = end; + } + + parse_block(out, rndr, work_data, work_size); + if (rndr->cb.blockquote) + rndr->cb.blockquote(ob, out, rndr->opaque); + rndr_popbuf(rndr, BUFFER_BLOCK); + return end; +} + +static size_t +parse_htmlblock(struct buf *ob, struct 
sd_markdown *rndr, uint8_t *data, size_t size, int do_render); + +/* parse_paragraph • handles parsing of a regular paragraph */ +static size_t +parse_paragraph(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size) +{ + size_t i = 0, end = 0; + int level = 0; + struct buf work = { data, 0, 0, 0 }; + + while (i < size) { + for (end = i + 1; end < size && data[end - 1] != '\n'; end++) /* empty */; + + if (prefix_quote(data + i, end - i) != 0) { + end = i; + break; + } + + if (is_empty(data + i, size - i)) + break; + + if ((level = is_headerline(data + i, size - i)) != 0) + break; + + if (is_atxheader(rndr, data + i, size - i) || + is_hrule(data + i, size - i) || + prefix_quote(data + i, size - i)) { + end = i; + break; + } + + /* + * Early termination of a paragraph with the same logic + * as Markdown 1.0.0. If this logic is applied, the + * Markdown 1.0.3 test suite won't pass cleanly + * + * :: If the first character in a new line is not a letter or digit, + * let's check to see if there's some kind of block starting + * here + */ + if ((rndr->ext_flags & MKDEXT_LAX_SPACING) && !isalnum(data[i])) { + if (prefix_oli(data + i, size - i) || + prefix_uli(data + i, size - i)) { + end = i; + break; + } + + /* see if an html block starts here */ + if (data[i] == '<' && rndr->cb.blockhtml && + parse_htmlblock(ob, rndr, data + i, size - i, 0)) { + end = i; + break; + } + + /* see if a code fence starts here */ + if ((rndr->ext_flags & MKDEXT_FENCED_CODE) != 0 && + is_codefence(data + i, size - i, NULL) != 0) { + end = i; + break; + } + } + + i = end; + } + + work.size = i; + while (work.size && data[work.size - 1] == '\n') + work.size--; + + if (!level) { + struct buf *tmp = rndr_newbuf(rndr, BUFFER_BLOCK); + parse_inline(tmp, rndr, work.data, work.size); + if (rndr->cb.paragraph) + rndr->cb.paragraph(ob, tmp, rndr->opaque); + rndr_popbuf(rndr, BUFFER_BLOCK); + } else { + struct buf *header_work; + + if (work.size) { + size_t beg; + i = work.size; + work.size -= 1; 
+ + while (work.size && data[work.size] != '\n') + work.size -= 1; + + beg = work.size + 1; + while (work.size && data[work.size - 1] == '\n') + work.size -= 1; + + if (work.size > 0) { + struct buf *tmp = rndr_newbuf(rndr, BUFFER_BLOCK); + parse_inline(tmp, rndr, work.data, work.size); + + if (rndr->cb.paragraph) + rndr->cb.paragraph(ob, tmp, rndr->opaque); + + rndr_popbuf(rndr, BUFFER_BLOCK); + work.data += beg; + work.size = i - beg; + } + else work.size = i; + } + + header_work = rndr_newbuf(rndr, BUFFER_SPAN); + parse_inline(header_work, rndr, work.data, work.size); + + if (rndr->cb.header) + rndr->cb.header(ob, header_work, (int)level, rndr->opaque); + + rndr_popbuf(rndr, BUFFER_SPAN); + } + + return end; +} + +/* parse_fencedcode • handles parsing of a block-level code fragment */ +static size_t +parse_fencedcode(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size) +{ + size_t beg, end; + struct buf *work = 0; + struct buf lang = { 0, 0, 0, 0 }; + + beg = is_codefence(data, size, &lang); + if (beg == 0) return 0; + + work = rndr_newbuf(rndr, BUFFER_BLOCK); + + while (beg < size) { + size_t fence_end; + struct buf fence_trail = { 0, 0, 0, 0 }; + + fence_end = is_codefence(data + beg, size - beg, &fence_trail); + if (fence_end != 0 && fence_trail.size == 0) { + beg += fence_end; + break; + } + + for (end = beg + 1; end < size && data[end - 1] != '\n'; end++); + + if (beg < end) { + /* verbatim copy to the working buffer, + escaping entities */ + if (is_empty(data + beg, end - beg)) + bufputc(work, '\n'); + else bufput(work, data + beg, end - beg); + } + beg = end; + } + + if (work->size && work->data[work->size - 1] != '\n') + bufputc(work, '\n'); + + if (rndr->cb.blockcode) + rndr->cb.blockcode(ob, work, lang.size ? 
&lang : NULL, rndr->opaque); + + rndr_popbuf(rndr, BUFFER_BLOCK); + return beg; +} + +static size_t +parse_blockcode(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size) +{ + size_t beg, end, pre; + struct buf *work = 0; + + work = rndr_newbuf(rndr, BUFFER_BLOCK); + + beg = 0; + while (beg < size) { + for (end = beg + 1; end < size && data[end - 1] != '\n'; end++) {}; + pre = prefix_code(data + beg, end - beg); + + if (pre) + beg += pre; /* skipping prefix */ + else if (!is_empty(data + beg, end - beg)) + /* non-empty non-prefixed line breaks the pre */ + break; + + if (beg < end) { + /* verbatim copy to the working buffer, + escaping entities */ + if (is_empty(data + beg, end - beg)) + bufputc(work, '\n'); + else bufput(work, data + beg, end - beg); + } + beg = end; + } + + while (work->size && work->data[work->size - 1] == '\n') + work->size -= 1; + + bufputc(work, '\n'); + + if (rndr->cb.blockcode) + rndr->cb.blockcode(ob, work, NULL, rndr->opaque); + + rndr_popbuf(rndr, BUFFER_BLOCK); + return beg; +} + +/* parse_listitem • parsing of a single list item */ +/* assuming initial prefix is already removed */ +static size_t +parse_listitem(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size, int *flags) +{ + struct buf *work = 0, *inter = 0; + size_t beg = 0, end, pre, sublist = 0, orgpre = 0, i; + int in_empty = 0, has_inside_empty = 0, in_fence = 0; + + /* keeping track of the first indentation prefix */ + while (orgpre < 3 && orgpre < size && data[orgpre] == ' ') + orgpre++; + + beg = prefix_uli(data, size); + if (!beg) + beg = prefix_oli(data, size); + + if (!beg) + return 0; + + /* skipping to the beginning of the following line */ + end = beg; + while (end < size && data[end - 1] != '\n') + end++; + + /* getting working buffers */ + work = rndr_newbuf(rndr, BUFFER_SPAN); + inter = rndr_newbuf(rndr, BUFFER_SPAN); + + /* putting the first line into the working buffer */ + bufput(work, data + beg, end - beg); + beg = end; + + /* 
process the following lines */ + while (beg < size) { + size_t has_next_uli = 0, has_next_oli = 0; + + end++; + + while (end < size && data[end - 1] != '\n') + end++; + + /* process an empty line */ + if (is_empty(data + beg, end - beg)) { + in_empty = 1; + beg = end; + continue; + } + + /* calculating the indentation */ + i = 0; + while (i < 4 && beg + i < end && data[beg + i] == ' ') + i++; + + pre = i; + + if (rndr->ext_flags & MKDEXT_FENCED_CODE) { + if (is_codefence(data + beg + i, end - beg - i, NULL) != 0) + in_fence = !in_fence; + } + + /* Only check for new list items if we are **not** inside + * a fenced code block */ + if (!in_fence) { + has_next_uli = prefix_uli(data + beg + i, end - beg - i); + has_next_oli = prefix_oli(data + beg + i, end - beg - i); + } + + /* checking for ul/ol switch */ + if (in_empty && ( + ((*flags & MKD_LIST_ORDERED) && has_next_uli) || + (!(*flags & MKD_LIST_ORDERED) && has_next_oli))){ + *flags |= MKD_LI_END; + break; /* the following item must have same list type */ + } + + /* checking for a new item */ + if ((has_next_uli && !is_hrule(data + beg + i, end - beg - i)) || has_next_oli) { + if (in_empty) + has_inside_empty = 1; + + if (pre == orgpre) /* the following item must have */ + break; /* the same indentation */ + + if (!sublist) + sublist = work->size; + } + /* joining only indented stuff after empty lines; + * note that now we only require 1 space of indentation + * to continue a list */ + else if (in_empty && pre == 0) { + *flags |= MKD_LI_END; + break; + } + else if (in_empty) { + bufputc(work, '\n'); + has_inside_empty = 1; + } + + in_empty = 0; + + /* adding the line without prefix into the working buffer */ + bufput(work, data + beg + i, end - beg - i); + beg = end; + } + + /* render of li contents */ + if (has_inside_empty) + *flags |= MKD_LI_BLOCK; + + if (*flags & MKD_LI_BLOCK) { + /* intermediate render of block li */ + if (sublist && sublist < work->size) { + parse_block(inter, rndr, work->data, sublist); + 
parse_block(inter, rndr, work->data + sublist, work->size - sublist); + } + else + parse_block(inter, rndr, work->data, work->size); + } else { + /* intermediate render of inline li */ + if (sublist && sublist < work->size) { + parse_inline(inter, rndr, work->data, sublist); + parse_block(inter, rndr, work->data + sublist, work->size - sublist); + } + else + parse_inline(inter, rndr, work->data, work->size); + } + + /* render of li itself */ + if (rndr->cb.listitem) + rndr->cb.listitem(ob, inter, *flags, rndr->opaque); + + rndr_popbuf(rndr, BUFFER_SPAN); + rndr_popbuf(rndr, BUFFER_SPAN); + return beg; +} + + +/* parse_list • parsing ordered or unordered list block */ +static size_t +parse_list(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size, int flags) +{ + struct buf *work = 0; + size_t i = 0, j; + + work = rndr_newbuf(rndr, BUFFER_BLOCK); + + while (i < size) { + j = parse_listitem(work, rndr, data + i, size - i, &flags); + i += j; + + if (!j || (flags & MKD_LI_END)) + break; + } + + if (rndr->cb.list) + rndr->cb.list(ob, work, flags, rndr->opaque); + rndr_popbuf(rndr, BUFFER_BLOCK); + return i; +} + +/* parse_atxheader • parsing of atx-style headers */ +static size_t +parse_atxheader(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size) +{ + size_t level = 0; + size_t i, end, skip; + + while (level < size && level < 6 && data[level] == '#') + level++; + + for (i = level; i < size && data[i] == ' '; i++); + + for (end = i; end < size && data[end] != '\n'; end++); + skip = end; + + while (end && data[end - 1] == '#') + end--; + + while (end && data[end - 1] == ' ') + end--; + + if (end > i) { + struct buf *work = rndr_newbuf(rndr, BUFFER_SPAN); + + parse_inline(work, rndr, data + i, end - i); + + if (rndr->cb.header) + rndr->cb.header(ob, work, (int)level, rndr->opaque); + + rndr_popbuf(rndr, BUFFER_SPAN); + } + + return skip; +} + + +/* htmlblock_end • checking end of HTML block : [ \t]*\n[ \t*]\n */ +/* returns the length on 
match, 0 otherwise */ +static size_t +htmlblock_end_tag( + const char *tag, + size_t tag_len, + struct sd_markdown *rndr, + uint8_t *data, + size_t size) +{ + size_t i, w; + + /* checking if tag is a match */ + if (tag_len + 3 >= size || + strncasecmp((char *)data + 2, tag, tag_len) != 0 || + data[tag_len + 2] != '>') + return 0; + + /* checking white lines */ + i = tag_len + 3; + w = 0; + if (i < size && (w = is_empty(data + i, size - i)) == 0) + return 0; /* non-blank after tag */ + i += w; + w = 0; + + if (i < size) + w = is_empty(data + i, size - i); + + return i + w; +} + +static size_t +htmlblock_end(const char *curtag, + struct sd_markdown *rndr, + uint8_t *data, + size_t size, + int start_of_line) +{ + size_t tag_size = strlen(curtag); + size_t i = 1, end_tag; + int block_lines = 0; + + while (i < size) { + i++; + while (i < size && !(data[i - 1] == '<' && data[i] == '/')) { + if (data[i] == '\n') + block_lines++; + + i++; + } + + /* If we are only looking for unindented tags, skip the tag + * if it doesn't follow a newline. 
+ * + * The only exception to this is if the tag is still on the + * initial line; in that case it still counts as a closing + * tag + */ + if (start_of_line && block_lines > 0 && data[i - 2] != '\n') + continue; + + if (i + 2 + tag_size >= size) + break; + + end_tag = htmlblock_end_tag(curtag, tag_size, rndr, data + i - 1, size - i + 1); + if (end_tag) + return i + end_tag - 1; + } + + return 0; +} + + +/* parse_htmlblock • parsing of inline HTML block */ +static size_t +parse_htmlblock(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size, int do_render) +{ + size_t i, j = 0, tag_end; + const char *curtag = NULL; + struct buf work = { data, 0, 0, 0 }; + + /* identification of the opening tag */ + if (size < 2 || data[0] != '<') + return 0; + + i = 1; + while (i < size && data[i] != '>' && data[i] != ' ') + i++; + + if (i < size) + curtag = find_block_tag((char *)data + 1, (int)i - 1); + + /* handling of special cases */ + if (!curtag) { + + /* HTML comment, laxist form */ + if (size > 5 && data[1] == '!' 
&& data[2] == '-' && data[3] == '-') { + i = 5; + + while (i < size && !(data[i - 2] == '-' && data[i - 1] == '-' && data[i] == '>')) + i++; + + i++; + + if (i < size) + j = is_empty(data + i, size - i); + + if (j) { + work.size = i + j; + if (do_render && rndr->cb.blockhtml) + rndr->cb.blockhtml(ob, &work, rndr->opaque); + return work.size; + } + } + + /* HR, which is the only self-closing block tag considered */ + if (size > 4 && (data[1] == 'h' || data[1] == 'H') && (data[2] == 'r' || data[2] == 'R')) { + i = 3; + while (i < size && data[i] != '>') + i++; + + if (i + 1 < size) { + i++; + j = is_empty(data + i, size - i); + if (j) { + work.size = i + j; + if (do_render && rndr->cb.blockhtml) + rndr->cb.blockhtml(ob, &work, rndr->opaque); + return work.size; + } + } + } + + /* no special case recognised */ + return 0; + } + + /* looking for an unindented matching closing tag */ + /* followed by a blank line */ + tag_end = htmlblock_end(curtag, rndr, data, size, 1); + + /* if not found, trying a second pass looking for indented match */ + /* but not if tag is "ins" or "del" (following original Markdown.pl) */ + if (!tag_end && strcmp(curtag, "ins") != 0 && strcmp(curtag, "del") != 0) { + tag_end = htmlblock_end(curtag, rndr, data, size, 0); + } + + if (!tag_end) + return 0; + + /* the end of the block has been found */ + work.size = tag_end; + if (do_render && rndr->cb.blockhtml) + rndr->cb.blockhtml(ob, &work, rndr->opaque); + + return tag_end; +} + +static void +parse_table_row( + struct buf *ob, + struct sd_markdown *rndr, + uint8_t *data, + size_t size, + size_t columns, + int *col_data, + int header_flag) +{ + size_t i = 0, col, cols_left; + struct buf *row_work = 0; + + if (!rndr->cb.table_cell || !rndr->cb.table_row) + return; + + row_work = rndr_newbuf(rndr, BUFFER_SPAN); + + if (i < size && data[i] == '|') + i++; + + for (col = 0; col < columns && i < size; ++col) { + size_t cell_start, cell_end; + struct buf *cell_work; + + cell_work = rndr_newbuf(rndr, 
BUFFER_SPAN); + + while (i < size && _isspace(data[i])) + i++; + + cell_start = i; + + while (i < size && data[i] != '|') + i++; + + cell_end = i - 1; + + while (cell_end > cell_start && _isspace(data[cell_end])) + cell_end--; + + parse_inline(cell_work, rndr, data + cell_start, 1 + cell_end - cell_start); + rndr->cb.table_cell(row_work, cell_work, col_data[col] | header_flag, rndr->opaque, 0); + + rndr_popbuf(rndr, BUFFER_SPAN); + i++; + } + + cols_left = columns - col; + if (cols_left > 0) { + struct buf empty_cell = { 0, 0, 0, 0 }; + rndr->cb.table_cell(row_work, &empty_cell, col_data[col] | header_flag, rndr->opaque, cols_left); + } + + rndr->cb.table_row(ob, row_work, rndr->opaque); + + rndr_popbuf(rndr, BUFFER_SPAN); +} + +static size_t /* parse_table_header • parses the header row and its underline; returns the offset just past the underline, 0 if this is not a table header */ +parse_table_header( + struct buf *ob, + struct sd_markdown *rndr, + uint8_t *data, + size_t size, + size_t *columns, + int **column_data) +{ + int pipes; + size_t i = 0, col, header_end, under_end; + + pipes = 0; + while (i < size && data[i] != '\n') + if (data[i++] == '|') + pipes++; + + if (i == size || pipes == 0) + return 0; + + header_end = i; + + while (header_end > 0 && _isspace(data[header_end - 1])) + header_end--; + + if (data[0] == '|') + pipes--; + + if (header_end && data[header_end - 1] == '|') + pipes--; + + if (pipes + 1 > rndr->max_table_cols) + return 0; + + *columns = pipes + 1; + *column_data = calloc(*columns, sizeof(int)); + + /* Parse the header underline */ + i++; + if (i < size && data[i] == '|') + i++; + + under_end = i; + while (under_end < size && data[under_end] != '\n') + under_end++; + + for (col = 0; col < *columns && i < under_end; ++col) { + size_t dashes = 0; + + while (i < under_end && data[i] == ' ') + i++; + + if (i < under_end && data[i] == ':') { /* i may equal under_end after the space skip; do not read past the underline */ + i++; (*column_data)[col] |= MKD_TABLE_ALIGN_L; + dashes++; + } + + while (i < under_end && data[i] == '-') { + i++; dashes++; + } + + if (i < under_end && data[i] == ':') { + i++; (*column_data)[col] |= MKD_TABLE_ALIGN_R; + dashes++; + } + + while (i < 
under_end && data[i] == ' ') + i++; + + if (i < under_end && data[i] != '|') + break; + + if (dashes < 1) + break; + + i++; + } + + if (col < *columns) + return 0; + + parse_table_row( + ob, rndr, data, + header_end, + *columns, + *column_data, + MKD_TABLE_HEADER + ); + + return under_end + 1; +} + +static size_t +parse_table( + struct buf *ob, + struct sd_markdown *rndr, + uint8_t *data, + size_t size) +{ + size_t i; + + struct buf *header_work = 0; + struct buf *body_work = 0; + + size_t columns; + int *col_data = NULL; + + header_work = rndr_newbuf(rndr, BUFFER_SPAN); + body_work = rndr_newbuf(rndr, BUFFER_BLOCK); + + i = parse_table_header(header_work, rndr, data, size, &columns, &col_data); + if (i > 0) { + + while (i < size) { + size_t row_start; + int pipes = 0; + + row_start = i; + + while (i < size && data[i] != '\n') + if (data[i++] == '|') + pipes++; + + if (pipes == 0 || i == size) { + i = row_start; + break; + } + + parse_table_row( + body_work, + rndr, + data + row_start, + i - row_start, + columns, + col_data, 0 + ); + + i++; + } + + if (rndr->cb.table) + rndr->cb.table(ob, header_work, body_work, rndr->opaque); + } + + free(col_data); + rndr_popbuf(rndr, BUFFER_SPAN); + rndr_popbuf(rndr, BUFFER_BLOCK); + return i; +} + +/* parse_block • parsing of one block, returning next uint8_t to parse */ +static void +parse_block(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t size) +{ + size_t beg, end, i; + uint8_t *txt_data; + beg = 0; + + if (rndr->work_bufs[BUFFER_SPAN].size + + rndr->work_bufs[BUFFER_BLOCK].size > rndr->max_nesting) + return; + + while (beg < size) { + txt_data = data + beg; + end = size - beg; + + if (is_atxheader(rndr, txt_data, end)) + beg += parse_atxheader(ob, rndr, txt_data, end); + + else if (data[beg] == '<' && rndr->cb.blockhtml && + (i = parse_htmlblock(ob, rndr, txt_data, end, 1)) != 0) + beg += i; + + else if ((i = is_empty(txt_data, end)) != 0) + beg += i; + + else if (is_hrule(txt_data, end)) { + if 
(rndr->cb.hrule) + rndr->cb.hrule(ob, rndr->opaque); + + while (beg < size && data[beg] != '\n') + beg++; + + beg++; + } + + else if ((rndr->ext_flags & MKDEXT_FENCED_CODE) != 0 && + (i = parse_fencedcode(ob, rndr, txt_data, end)) != 0) + beg += i; + + else if ((rndr->ext_flags & MKDEXT_TABLES) != 0 && + (i = parse_table(ob, rndr, txt_data, end)) != 0) + beg += i; + + else if (prefix_quote(txt_data, end)) + beg += parse_blockquote(ob, rndr, txt_data, end); + + else if (prefix_code(txt_data, end)) + beg += parse_blockcode(ob, rndr, txt_data, end); + + else if (prefix_uli(txt_data, end)) + beg += parse_list(ob, rndr, txt_data, end, 0); + + else if (prefix_oli(txt_data, end)) + beg += parse_list(ob, rndr, txt_data, end, MKD_LIST_ORDERED); + + else + beg += parse_paragraph(ob, rndr, txt_data, end); + } +} + + + +/********************* + * REFERENCE PARSING * + *********************/ + +/* is_ref • returns whether a line is a reference or not */ +static int +is_ref(const uint8_t *data, size_t beg, size_t end, size_t *last, struct link_ref **refs) +{ +/* int n; */ + size_t i = 0; + size_t id_offset, id_end; + size_t link_offset, link_end; + size_t title_offset, title_end; + size_t line_end; + + /* up to 3 optional leading spaces */ + if (beg + 3 >= end) return 0; + if (data[beg] == ' ') { i = 1; + if (data[beg + 1] == ' ') { i = 2; + if (data[beg + 2] == ' ') { i = 3; + if (data[beg + 3] == ' ') return 0; } } } + i += beg; + + /* id part: anything but a newline between brackets */ + if (data[i] != '[') return 0; + i++; + id_offset = i; + while (i < end && data[i] != '\n' && data[i] != '\r' && data[i] != ']') + i++; + if (i >= end || data[i] != ']') return 0; + id_end = i; + + /* spacer: colon (space | tab)* newline? 
(space | tab)* */ + i++; + if (i >= end || data[i] != ':') return 0; + i++; + while (i < end && data[i] == ' ') i++; + if (i < end && (data[i] == '\n' || data[i] == '\r')) { + i++; + if (i < end && data[i] == '\r' && data[i - 1] == '\n') i++; } + while (i < end && data[i] == ' ') i++; + if (i >= end) return 0; + + /* link: whitespace-free sequence, optionally between angle brackets */ + if (data[i] == '<') + i++; + + link_offset = i; + + while (i < end && data[i] != ' ' && data[i] != '\n' && data[i] != '\r') + i++; + + if (data[i - 1] == '>') link_end = i - 1; + else link_end = i; + + /* optional spacer: (space | tab)* (newline | '\'' | '"' | '(' ) */ + while (i < end && data[i] == ' ') i++; + if (i < end && data[i] != '\n' && data[i] != '\r' + && data[i] != '\'' && data[i] != '"' && data[i] != '(') + return 0; + line_end = 0; + /* computing end-of-line */ + if (i >= end || data[i] == '\r' || data[i] == '\n') line_end = i; + if (i + 1 < end && data[i] == '\n' && data[i + 1] == '\r') + line_end = i + 1; + + /* optional (space|tab)* spacer after a newline */ + if (line_end) { + i = line_end + 1; + while (i < end && data[i] == ' ') i++; } + + /* optional title: any non-newline sequence enclosed in '"() + alone on its line */ + title_offset = title_end = 0; + if (i + 1 < end + && (data[i] == '\'' || data[i] == '"' || data[i] == '(')) { + i++; + title_offset = i; + /* looking for EOL */ + while (i < end && data[i] != '\n' && data[i] != '\r') i++; + if (i + 1 < end && data[i] == '\n' && data[i + 1] == '\r') + title_end = i + 1; + else title_end = i; + /* stepping back */ + i -= 1; + while (i > title_offset && data[i] == ' ') + i -= 1; + if (i > title_offset + && (data[i] == '\'' || data[i] == '"' || data[i] == ')')) { + line_end = title_end; + title_end = i; } } + + if (!line_end || link_end == link_offset) + return 0; /* garbage after the link empty link */ + + /* a valid ref has been found, filling-in return structures */ + if (last) + *last = line_end; + + if (refs) { 
+ struct link_ref *ref; + + ref = add_link_ref(refs, data + id_offset, id_end - id_offset); + if (!ref) + return 0; + + ref->link = bufnew(link_end - link_offset); + bufput(ref->link, data + link_offset, link_end - link_offset); + + if (title_end > title_offset) { + ref->title = bufnew(title_end - title_offset); + bufput(ref->title, data + title_offset, title_end - title_offset); + } + } + + return 1; +} + +static void expand_tabs(struct buf *ob, const uint8_t *line, size_t size) +{ + size_t i = 0, tab = 0; + + while (i < size) { + size_t org = i; + + while (i < size && line[i] != '\t') { + i++; tab++; + } + + if (i > org) + bufput(ob, line + org, i - org); + + if (i >= size) + break; + + do { + bufputc(ob, ' '); tab++; + } while (tab % 4); + + i++; + } +} + +/********************** + * EXPORTED FUNCTIONS * + **********************/ + +struct sd_markdown * +sd_markdown_new( + unsigned int extensions, + size_t max_nesting, + size_t max_table_cols, + const struct sd_callbacks *callbacks, + void *opaque) +{ + struct sd_markdown *md = NULL; + + assert(max_nesting > 0 && max_table_cols > 0 && callbacks); + + md = malloc(sizeof(struct sd_markdown)); + if (!md) + return NULL; + + memcpy(&md->cb, callbacks, sizeof(struct sd_callbacks)); + + stack_init(&md->work_bufs[BUFFER_BLOCK], 4); + stack_init(&md->work_bufs[BUFFER_SPAN], 8); + + memset(md->active_char, 0x0, 256); + + if (md->cb.emphasis || md->cb.double_emphasis || md->cb.triple_emphasis) { + md->active_char['*'] = MD_CHAR_EMPHASIS; + md->active_char['_'] = MD_CHAR_EMPHASIS; + if (extensions & MKDEXT_STRIKETHROUGH) + md->active_char['~'] = MD_CHAR_EMPHASIS; + } + + if (md->cb.codespan) + md->active_char['`'] = MD_CHAR_CODESPAN; + + if (md->cb.linebreak) + md->active_char['\n'] = MD_CHAR_LINEBREAK; + + if (md->cb.image || md->cb.link) + md->active_char['['] = MD_CHAR_LINK; + + md->active_char['<'] = MD_CHAR_LANGLE; + md->active_char['\\'] = MD_CHAR_ESCAPE; + md->active_char['&'] = MD_CHAR_ENTITITY; + + if (extensions & 
MKDEXT_AUTOLINK) { + if (!(extensions & MKDEXT_NO_EMAIL_AUTOLINK)) + md->active_char['@'] = MD_CHAR_AUTOLINK_EMAIL; + md->active_char[':'] = MD_CHAR_AUTOLINK_URL; + md->active_char['w'] = MD_CHAR_AUTOLINK_WWW; + md->active_char['/'] = MD_CHAR_AUTOLINK_SUBREDDIT_OR_USERNAME; + } + + if (extensions & MKDEXT_SUPERSCRIPT) + md->active_char['^'] = MD_CHAR_SUPERSCRIPT; + + /* Extension data */ + md->ext_flags = extensions; + md->opaque = opaque; + md->max_nesting = max_nesting; + md->max_table_cols = max_table_cols; + md->in_link_body = 0; + + return md; +} + +void +sd_markdown_render(struct buf *ob, const uint8_t *document, size_t doc_size, struct sd_markdown *md) +{ +#define MARKDOWN_GROW(x) ((x) + ((x) >> 1)) + static const char UTF8_BOM[] = {0xEF, 0xBB, 0xBF}; + + struct buf *text; + size_t beg, end; + + text = bufnew(64); + if (!text) + return; + + /* Preallocate enough space for our buffer to avoid expanding while copying */ + bufgrow(text, doc_size); + + /* reset the references table */ + memset(&md->refs, 0x0, REF_TABLE_SIZE * sizeof(void *)); + + /* first pass: looking for references, copying everything else */ + beg = 0; + + /* Skip a possible UTF-8 BOM, even though the Unicode standard + * discourages having these in UTF-8 documents */ + if (doc_size >= 3 && memcmp(document, UTF8_BOM, 3) == 0) + beg += 3; + + while (beg < doc_size) /* iterating over lines */ + if (is_ref(document, beg, doc_size, &end, md->refs)) + beg = end; + else { /* skipping to the next line */ + end = beg; + while (end < doc_size && document[end] != '\n' && document[end] != '\r') + end++; + + /* adding the line body if present */ + if (end > beg) + expand_tabs(text, document + beg, end - beg); + + while (end < doc_size && (document[end] == '\n' || document[end] == '\r')) { + /* add one \n per newline */ + if (document[end] == '\n' || (end + 1 < doc_size && document[end + 1] != '\n')) + bufputc(text, '\n'); + end++; + } + + beg = end; + } + + /* pre-grow the output buffer to minimize 
allocations */ + bufgrow(ob, MARKDOWN_GROW(text->size)); + + /* second pass: actual rendering */ + if (md->cb.doc_header) + md->cb.doc_header(ob, md->opaque); + + if (text->size) { + /* adding a final newline if not already present */ + if (text->data[text->size - 1] != '\n' && text->data[text->size - 1] != '\r') + bufputc(text, '\n'); + + parse_block(ob, md, text->data, text->size); + } + + if (md->cb.doc_footer) + md->cb.doc_footer(ob, md->opaque); + + /* clean-up */ + bufrelease(text); + free_link_refs(md->refs); + + assert(md->work_bufs[BUFFER_SPAN].size == 0); + assert(md->work_bufs[BUFFER_BLOCK].size == 0); +} + +void +sd_markdown_free(struct sd_markdown *md) +{ + size_t i; + + for (i = 0; i < (size_t)md->work_bufs[BUFFER_SPAN].asize; ++i) + bufrelease(md->work_bufs[BUFFER_SPAN].item[i]); + + for (i = 0; i < (size_t)md->work_bufs[BUFFER_BLOCK].asize; ++i) + bufrelease(md->work_bufs[BUFFER_BLOCK].item[i]); + + stack_free(&md->work_bufs[BUFFER_SPAN]); + stack_free(&md->work_bufs[BUFFER_BLOCK]); + + free(md); +} + +void +sd_version(int *ver_major, int *ver_minor, int *ver_revision) +{ + *ver_major = SUNDOWN_VER_MAJOR; + *ver_minor = SUNDOWN_VER_MINOR; + *ver_revision = SUNDOWN_VER_REVISION; +} + +/* vim: set filetype=c: */ diff --git a/SnudownTest/src/markdown.h b/SnudownTest/src/markdown.h new file mode 100644 index 0000000..00d50dc --- /dev/null +++ b/SnudownTest/src/markdown.h @@ -0,0 +1,140 @@ +/* markdown.h - generic markdown parser */ + +/* + * Copyright (c) 2009, Natacha Porté + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef UPSKIRT_MARKDOWN_H +#define UPSKIRT_MARKDOWN_H + +#include "buffer.h" +#include "autolink.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define SUNDOWN_VERSION "1.16.0" +#define SUNDOWN_VER_MAJOR 1 +#define SUNDOWN_VER_MINOR 16 +#define SUNDOWN_VER_REVISION 0 + +/******************** + * TYPE DEFINITIONS * + ********************/ + +/* mkd_autolink - type of autolink */ +enum mkd_autolink { + MKDA_NOT_AUTOLINK, /* used internally when it is not an autolink */ + MKDA_NORMAL, /* normal http/https/ftp/mailto/etc link */ + MKDA_EMAIL, /* e-mail link without explicit mailto: */ +}; + +enum mkd_tableflags { + MKD_TABLE_ALIGN_L = 1, + MKD_TABLE_ALIGN_R = 2, + MKD_TABLE_ALIGN_CENTER = 3, + MKD_TABLE_ALIGNMASK = 3, + MKD_TABLE_HEADER = 4 +}; + +enum mkd_extensions { + MKDEXT_NO_INTRA_EMPHASIS = (1 << 0), + MKDEXT_TABLES = (1 << 1), + MKDEXT_FENCED_CODE = (1 << 2), + MKDEXT_AUTOLINK = (1 << 3), + MKDEXT_STRIKETHROUGH = (1 << 4), + MKDEXT_SPACE_HEADERS = (1 << 6), + MKDEXT_SUPERSCRIPT = (1 << 7), + MKDEXT_LAX_SPACING = (1 << 8), + MKDEXT_NO_EMAIL_AUTOLINK = (1 << 9), +}; + +/* sd_callbacks - functions for rendering parsed data */ +struct sd_callbacks { + /* block level callbacks - NULL skips the block */ + void (*blockcode)(struct buf *ob, const struct buf *text, const struct buf *lang, void *opaque); + void (*blockquote)(struct buf *ob, const struct buf *text, void *opaque); + void (*blockhtml)(struct buf *ob,const struct buf *text, void *opaque); + void (*header)(struct buf *ob, const struct buf *text, int level, void *opaque); + void (*hrule)(struct buf *ob, void *opaque); + void (*list)(struct buf *ob, const struct buf *text, 
int flags, void *opaque); + void (*listitem)(struct buf *ob, const struct buf *text, int flags, void *opaque); + void (*paragraph)(struct buf *ob, const struct buf *text, void *opaque); + void (*table)(struct buf *ob, const struct buf *header, const struct buf *body, void *opaque); + void (*table_row)(struct buf *ob, const struct buf *text, void *opaque); + void (*table_cell)(struct buf *ob, const struct buf *text, int flags, void *opaque, int col_span); + + + /* span level callbacks - NULL or return 0 prints the span verbatim */ + int (*autolink)(struct buf *ob, const struct buf *link, enum mkd_autolink type, void *opaque); + int (*codespan)(struct buf *ob, const struct buf *text, void *opaque); + int (*double_emphasis)(struct buf *ob, const struct buf *text, void *opaque); + int (*emphasis)(struct buf *ob, const struct buf *text, void *opaque); + int (*image)(struct buf *ob, const struct buf *link, const struct buf *title, const struct buf *alt, void *opaque); + int (*linebreak)(struct buf *ob, void *opaque); + int (*link)(struct buf *ob, const struct buf *link, const struct buf *title, const struct buf *content, void *opaque); + int (*raw_html_tag)(struct buf *ob, const struct buf *tag, void *opaque); + int (*triple_emphasis)(struct buf *ob, const struct buf *text, void *opaque); + int (*strikethrough)(struct buf *ob, const struct buf *text, void *opaque); + int (*superscript)(struct buf *ob, const struct buf *text, void *opaque); + + /* low level callbacks - NULL copies input directly into the output */ + void (*entity)(struct buf *ob, const struct buf *entity, void *opaque); + void (*normal_text)(struct buf *ob, const struct buf *text, void *opaque); + + /* header and footer */ + void (*doc_header)(struct buf *ob, void *opaque); + void (*doc_footer)(struct buf *ob, void *opaque); +}; + +struct sd_markdown; + +/********* + * FLAGS * + *********/ + +/* list/listitem flags */ +#define MKD_LIST_ORDERED 1 +#define MKD_LI_BLOCK 2 /*
  • containing block data */ + +/********************** + * EXPORTED FUNCTIONS * + **********************/ + +extern struct sd_markdown * +sd_markdown_new( + unsigned int extensions, + size_t max_nesting, + size_t max_table_cols, + const struct sd_callbacks *callbacks, + void *opaque); + +extern void +sd_markdown_render(struct buf *ob, const uint8_t *document, size_t doc_size, struct sd_markdown *md); + +extern void +sd_markdown_free(struct sd_markdown *md); + +extern void +sd_version(int *major, int *minor, int *revision); + +#ifdef __cplusplus +} +#endif + +#endif + +/* vim: set filetype=c: */ diff --git a/SnudownTest/src/stack.c b/SnudownTest/src/stack.c new file mode 100644 index 0000000..ce069ff --- /dev/null +++ b/SnudownTest/src/stack.c @@ -0,0 +1,81 @@ +#include "stack.h" +#include + +int +stack_grow(struct stack *st, size_t new_size) +{ + void **new_st; + + if (st->asize >= new_size) + return 0; + + new_st = realloc(st->item, new_size * sizeof(void *)); + if (new_st == NULL) + return -1; + + memset(new_st + st->asize, 0x0, + (new_size - st->asize) * sizeof(void *)); + + st->item = new_st; + st->asize = new_size; + + if (st->size > new_size) + st->size = new_size; + + return 0; +} + +void +stack_free(struct stack *st) +{ + if (!st) + return; + + free(st->item); + + st->item = NULL; + st->size = 0; + st->asize = 0; +} + +int +stack_init(struct stack *st, size_t initial_size) +{ + st->item = NULL; + st->size = 0; + st->asize = 0; + + if (!initial_size) + initial_size = 8; + + return stack_grow(st, initial_size); +} + +void * +stack_pop(struct stack *st) +{ + if (!st->size) + return NULL; + + return st->item[--st->size]; +} + +int +stack_push(struct stack *st, void *item) +{ + if (stack_grow(st, st->size * 2) < 0) + return -1; + + st->item[st->size++] = item; + return 0; +} + +void * +stack_top(struct stack *st) +{ + if (!st->size) + return NULL; + + return st->item[st->size - 1]; +} + diff --git a/SnudownTest/src/stack.h b/SnudownTest/src/stack.h new file 
mode 100644 index 0000000..08ff030 --- /dev/null +++ b/SnudownTest/src/stack.h @@ -0,0 +1,29 @@ +#ifndef STACK_H__ +#define STACK_H__ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct stack { + void **item; + size_t size; + size_t asize; +}; + +void stack_free(struct stack *); +int stack_grow(struct stack *, size_t); +int stack_init(struct stack *, size_t); + +int stack_push(struct stack *, void *); + +void *stack_pop(struct stack *); +void *stack_top(struct stack *); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/SnudownTest/stack.c b/SnudownTest/stack.c new file mode 100644 index 0000000..ce069ff --- /dev/null +++ b/SnudownTest/stack.c @@ -0,0 +1,81 @@ +#include "stack.h" +#include + +int +stack_grow(struct stack *st, size_t new_size) +{ + void **new_st; + + if (st->asize >= new_size) + return 0; + + new_st = realloc(st->item, new_size * sizeof(void *)); + if (new_st == NULL) + return -1; + + memset(new_st + st->asize, 0x0, + (new_size - st->asize) * sizeof(void *)); + + st->item = new_st; + st->asize = new_size; + + if (st->size > new_size) + st->size = new_size; + + return 0; +} + +void +stack_free(struct stack *st) +{ + if (!st) + return; + + free(st->item); + + st->item = NULL; + st->size = 0; + st->asize = 0; +} + +int +stack_init(struct stack *st, size_t initial_size) +{ + st->item = NULL; + st->size = 0; + st->asize = 0; + + if (!initial_size) + initial_size = 8; + + return stack_grow(st, initial_size); +} + +void * +stack_pop(struct stack *st) +{ + if (!st->size) + return NULL; + + return st->item[--st->size]; +} + +int +stack_push(struct stack *st, void *item) +{ + if (stack_grow(st, st->size * 2) < 0) + return -1; + + st->item[st->size++] = item; + return 0; +} + +void * +stack_top(struct stack *st) +{ + if (!st->size) + return NULL; + + return st->item[st->size - 1]; +} + diff --git a/SnudownTest/stack.h b/SnudownTest/stack.h new file mode 100644 index 0000000..08ff030 --- /dev/null +++ b/SnudownTest/stack.h @@ -0,0 +1,29 @@ 
+#ifndef STACK_H__ +#define STACK_H__ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct stack { + void **item; + size_t size; + size_t asize; +}; + +void stack_free(struct stack *); +int stack_grow(struct stack *, size_t); +int stack_init(struct stack *, size_t); + +int stack_push(struct stack *, void *); + +void *stack_pop(struct stack *); +void *stack_top(struct stack *); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/SnudownTest/stdint.h b/SnudownTest/stdint.h new file mode 100644 index 0000000..6423fc8 --- /dev/null +++ b/SnudownTest/stdint.h @@ -0,0 +1,199 @@ +/* stdint.h standard header */ +#pragma once +#ifndef _STDINT +#define _STDINT +#ifndef RC_INVOKED +#include + +/* NB: assumes + byte has 8 bits + long is 32 bits + pointer can convert to and from long long + long long is longest type + */ + +_C_STD_BEGIN + /* TYPE DEFINITIONS */ +typedef signed char int8_t; +typedef short int16_t; +typedef int int32_t; + +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; + +typedef signed char int_least8_t; +typedef short int_least16_t; +typedef int int_least32_t; + +typedef unsigned char uint_least8_t; +typedef unsigned short uint_least16_t; +typedef unsigned int uint_least32_t; + +typedef char int_fast8_t; +typedef int int_fast16_t; +typedef int int_fast32_t; + +typedef unsigned char uint_fast8_t; +typedef unsigned int uint_fast16_t; +typedef unsigned int uint_fast32_t; + +#ifndef _INTPTR_T_DEFINED + #define _INTPTR_T_DEFINED + #ifdef _WIN64 +typedef __int64 intptr_t; + #else /* _WIN64 */ +typedef _W64 int intptr_t; + #endif /* _WIN64 */ +#endif /* _INTPTR_T_DEFINED */ + +#ifndef _UINTPTR_T_DEFINED + #define _UINTPTR_T_DEFINED + #ifdef _WIN64 +typedef unsigned __int64 uintptr_t; + #else /* _WIN64 */ +typedef _W64 unsigned int uintptr_t; + #endif /* _WIN64 */ +#endif /* _UINTPTR_T_DEFINED */ + +typedef _Longlong int64_t; +typedef _ULonglong uint64_t; + +typedef _Longlong int_least64_t; +typedef 
_ULonglong uint_least64_t; + +typedef _Longlong int_fast64_t; +typedef _ULonglong uint_fast64_t; + +typedef _Longlong intmax_t; +typedef _ULonglong uintmax_t; + + /* LIMIT MACROS */ +#define INT8_MIN (-0x7f - _C2) +#define INT16_MIN (-0x7fff - _C2) +#define INT32_MIN (-0x7fffffff - _C2) + +#define INT8_MAX 0x7f +#define INT16_MAX 0x7fff +#define INT32_MAX 0x7fffffff +#define UINT8_MAX 0xff +#define UINT16_MAX 0xffff +#define UINT32_MAX 0xffffffff + +#define INT_LEAST8_MIN (-0x7f - _C2) +#define INT_LEAST16_MIN (-0x7fff - _C2) +#define INT_LEAST32_MIN (-0x7fffffff - _C2) + +#define INT_LEAST8_MAX 0x7f +#define INT_LEAST16_MAX 0x7fff +#define INT_LEAST32_MAX 0x7fffffff +#define UINT_LEAST8_MAX 0xff +#define UINT_LEAST16_MAX 0xffff +#define UINT_LEAST32_MAX 0xffffffff + +#define INT_FAST8_MIN (-0x7f - _C2) +#define INT_FAST16_MIN (-0x7fff - _C2) +#define INT_FAST32_MIN (-0x7fffffff - _C2) + +#define INT_FAST8_MAX 0x7f +#define INT_FAST16_MAX 0x7fff +#define INT_FAST32_MAX 0x7fffffff +#define UINT_FAST8_MAX 0xff +#define UINT_FAST16_MAX 0xffff +#define UINT_FAST32_MAX 0xffffffff + + #if _INTPTR == 0 || _INTPTR == 1 +#define INTPTR_MAX 0x7fffffff +#define INTPTR_MIN (-INTPTR_MAX - _C2) +#define UINTPTR_MAX 0xffffffff + + #else /* _INTPTR == 2 */ +#define INTPTR_MIN (-_LLONG_MAX - _C2) +#define INTPTR_MAX _LLONG_MAX +#define UINTPTR_MAX _ULLONG_MAX +#endif /* _INTPTR */ + +#define INT8_C(x) (x) +#define INT16_C(x) (x) +#define INT32_C(x) ((x) + (INT32_MAX - INT32_MAX)) + +#define UINT8_C(x) (x) +#define UINT16_C(x) (x) +#define UINT32_C(x) ((x) + (UINT32_MAX - UINT32_MAX)) + +#ifdef _WIN64 + #define PTRDIFF_MIN INT64_MIN + #define PTRDIFF_MAX INT64_MAX +#else /* _WIN64 */ + #define PTRDIFF_MIN INT32_MIN + #define PTRDIFF_MAX INT32_MAX +#endif /* _WIN64 */ + +#define SIG_ATOMIC_MIN INT32_MIN +#define SIG_ATOMIC_MAX INT32_MAX + +#ifndef SIZE_MAX + #ifdef _WIN64 + #define SIZE_MAX UINT64_MAX + #else /* _WIN64 */ + #define SIZE_MAX UINT32_MAX + #endif /* _WIN64 */ +#endif /* 
SIZE_MAX */ + +#define WCHAR_MIN 0x0000 +#define WCHAR_MAX 0xffff + +#define WINT_MIN 0x0000 +#define WINT_MAX 0xffff + + #define INT64_MIN (-0x7fffffffffffffff - _C2) + #define INT64_MAX 0x7fffffffffffffff + #define UINT64_MAX 0xffffffffffffffffU + + #define INT_LEAST64_MIN (-0x7fffffffffffffff - _C2) + #define INT_LEAST64_MAX 0x7fffffffffffffff + #define UINT_LEAST64_MAX 0xffffffffffffffffU + + #define INT_FAST64_MIN (-0x7fffffffffffffff - _C2) + #define INT_FAST64_MAX 0x7fffffffffffffff + #define UINT_FAST64_MAX 0xffffffffffffffffU + + #define INTMAX_MIN (-0x7fffffffffffffff - _C2) + #define INTMAX_MAX 0x7fffffffffffffff + #define UINTMAX_MAX 0xffffffffffffffffU + +#define INT64_C(x) ((x) + (INT64_MAX - INT64_MAX)) +#define UINT64_C(x) ((x) + (UINT64_MAX - UINT64_MAX)) +#define INTMAX_C(x) INT64_C(x) +#define UINTMAX_C(x) UINT64_C(x) +_C_STD_END +#endif /* RC_INVOKED */ +#endif /* _STDINT */ + + #if defined(_STD_USING) +using _CSTD int8_t; using _CSTD int16_t; +using _CSTD int32_t; using _CSTD int64_t; + +using _CSTD uint8_t; using _CSTD uint16_t; +using _CSTD uint32_t; using _CSTD uint64_t; + +using _CSTD int_least8_t; using _CSTD int_least16_t; +using _CSTD int_least32_t; using _CSTD int_least64_t; +using _CSTD uint_least8_t; using _CSTD uint_least16_t; +using _CSTD uint_least32_t; using _CSTD uint_least64_t; + +using _CSTD intmax_t; using _CSTD uintmax_t; + +using _CSTD uintptr_t; +using _CSTD intptr_t; + +using _CSTD int_fast8_t; using _CSTD int_fast16_t; +using _CSTD int_fast32_t; using _CSTD int_fast64_t; +using _CSTD uint_fast8_t; using _CSTD uint_fast16_t; +using _CSTD uint_fast32_t; using _CSTD uint_fast64_t; + #endif /* defined(_STD_USING) */ + +/* + * Copyright (c) 1992-2009 by P.J. Plauger. ALL RIGHTS RESERVED. + * Consult your license regarding permissions and restrictions. 
+V5.20:0009 */ diff --git a/SnudownTest/sundown.def b/SnudownTest/sundown.def new file mode 100644 index 0000000..7cd41bb --- /dev/null +++ b/SnudownTest/sundown.def @@ -0,0 +1,20 @@ +LIBRARY SUNDOWN +EXPORTS + sdhtml_renderer + sdhtml_toc_renderer + sdhtml_smartypants + bufgrow + bufnew + bufcstr + bufprefix + bufput + bufputs + bufputc + bufrelease + bufreset + bufslurp + bufprintf + sd_markdown_new + sd_markdown_render + sd_markdown_free + sd_version \ No newline at end of file diff --git a/SnudownTest/test_snudown.py b/SnudownTest/test_snudown.py new file mode 100644 index 0000000..fa9568f --- /dev/null +++ b/SnudownTest/test_snudown.py @@ -0,0 +1,461 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import snudown +import unittest +import itertools +import cStringIO as StringIO + + +cases = { + '': '', + 'http://www.reddit.com': + '

    http://www.reddit.com

    \n', + + 'http://www.reddit.com/a\x00b': + '

    http://www.reddit.com/ab

    \n', + + 'foo@example.com': + '

    foo@example.com

    \n', + + '[foo](http://en.wikipedia.org/wiki/Link_(film\))': + '

    foo

    \n', + + '(http://tsfr.org)': + '

    (http://tsfr.org)

    \n', + + '[A link with a /r/subreddit in it](/lol)': + '

    A link with a /r/subreddit in it

    \n', + + '[A link with a http://www.url.com in it](/lol)': + '

    A link with a http://www.url.com in it

    \n', + + '[Empty Link]()': + '

    [Empty Link]()

    \n', + + 'http://en.wikipedia.org/wiki/café_racer': + '

    http://en.wikipedia.org/wiki/café_racer

    \n', + + '#####################################################hi': + '
    ###############################################hi
    \n', + + '[foo](http://bar\nbar)': + '

    foo

    \n', + + '/r/test': + '

    /r/test

    \n', + + 'Words words /r/test words': + '

    Words words /r/test words

    \n', + + '/r/': + '

    /r/

    \n', + + r'escaped \/r/test': + '

    escaped /r/test

    \n', + + 'ampersands http://www.google.com?test&blah': + '

    ampersands http://www.google.com?test&blah

    \n', + + '[_regular_ link with nesting](/test)': + '

    regular link with nesting

    \n', + + ' www.a.co?with&test': + '

    www.a.co?with&test

    \n', + + r'Normal^superscript': + '

    Normalsuperscript

    \n', + + r'Escape\^superscript': + '

    Escape^superscript

    \n', + + r'~~normal strikethrough~~': + '

    normal strikethrough

    \n', + + r'\~~escaped strikethrough~~': + '

    ~~escaped strikethrough~~

    \n', + + 'anywhere\x03, you': + '

    anywhere, you

    \n', + + '[Test](//test)': + '

    Test

    \n', + + '[Test](//#test)': + '

    Test

    \n', + + '[Test](#test)': + '

    Test

    \n', + + '[Test](git://github.com)': + '

    Test

    \n', + + '[Speculation](//?)': + '

    Speculation

    \n', + + '/r/sr_with_underscores': + '

    /r/sr_with_underscores

    \n', + + '[Test](///#test)': + '

    Test

    \n', + + '/r/multireddit+test+yay': + '

    /r/multireddit+test+yay

    \n', + + '': + '

    <test>

    \n', + + 'words_with_underscores': + '

    words_with_underscores

    \n', + + 'words*with*asterisks': + '

    wordswithasterisks

    \n', + + '~test': + '

    ~test

    \n', + + '/u/test': + '

    /u/test

    \n', + + '/u/test/m/test test': + '

    /u/test/m/test test

    \n', + + '/U/nope': + '

    /U/nope

    \n', + + '/r/test/m/test test': + '

    /r/test/m/test test

    \n', + + '/r/test/w/test test': + '

    /r/test/w/test test

    \n', + + '/r/test/comments/test test': + '

    /r/test/comments/test test

    \n', + + '/u/test/commentscommentscommentscommentscommentscommentscomments/test test': + '

    /u/test/commentscommentscommentscommentscommentscommentscomments/test test

    \n', + + 'a /u/reddit': + '

    a /u/reddit

    \n', + + 'u/reddit': + '

    u/reddit

    \n', + + 'a u/reddit': + '

    a u/reddit

    \n', + + 'a u/reddit/foobaz': + '

    a u/reddit/foobaz

    \n', + + 'foo:u/reddit': + '

    foo:u/reddit

    \n', + + 'fuu/reddit': + '

    fuu/reddit

    \n', + + # Don't treat unicode punctuation as a word boundary for now + u'a。u/reddit'.encode('utf8'): + u'

    a。u/reddit

    \n'.encode('utf8'), + + '\\/u/me': + '

    /u/me

    \n', + + '\\\\/u/me': + '

    \\/u/me

    \n', + + '\\u/me': + '

    \\u/me

    \n', + + '\\\\u/me': + '

    \\u/me

    \n', + + 'u\\/me': + '

    u/me

    \n', + + '*u/me*': + '

    u/me

    \n', + + 'foo^u/me': + '

    foou/me

    \n', + + '*foo*u/me': + '

    foou/me

    \n', + + 'u/me': + '

    u/me

    \n', + + '/u/me': + '

    /u/me

    \n', + + 'u/m': + '

    u/m

    \n', + + '/u/m': + '

    /u/m

    \n', + + '/f/oobar': + '

    /f/oobar

    \n', + + 'f/oobar': + '

    f/oobar

    \n', + + '/r/test/commentscommentscommentscommentscommentscommentscomments/test test': + '

    /r/test/commentscommentscommentscommentscommentscommentscomments/test test

    \n', + + 'blah \\': + '

    blah \\

    \n', + + '/r/whatever: fork': + '

    /r/whatever: fork

    \n', + + '/r/t:timereddit': + '

    /r/t:timereddit

    \n', + + '/r/reddit.com': + '

    /r/reddit.com

    \n', + + '/r/not.cool': + '

    /r/not.cool

    \n', + + '/r/very+clever+multireddit+reddit.com+t:fork+yay': + '

    /r/very+clever+multireddit+reddit.com+t:fork+yay

    \n', + + '/r/t:heatdeathoftheuniverse': + '

    /r/t:heatdeathoftheuniverse

    \n', + + '/r/all-minus-something': + '

    /r/all-minus-something

    \n', + + '/r/notall-minus': + '

    /r/notall-minus

    \n', + + 'a /r/reddit.com': + '

    a /r/reddit.com

    \n', + + 'a r/reddit.com': + '

    a r/reddit.com

    \n', + + 'foo:r/reddit.com': + '

    foo:r/reddit.com

    \n', + + 'foobar/reddit.com': + '

    foobar/reddit.com

    \n', + + u'a。r/reddit.com'.encode('utf8'): + u'

    a。r/reddit.com

    \n'.encode('utf8'), + + '/R/reddit.com': + '

    /R/reddit.com

    \n', + + '/r/irc://foo.bar/': + '

    /r/irc://foo.bar/

    \n', + + '/r/t:irc//foo.bar/': + '

    /r/t:irc//foo.bar/

    \n', + + '/r/all-irc://foo.bar/': + '

    /r/all-irc://foo.bar/

    \n', + + '/r/foo+irc://foo.bar/': + '

    /r/foo+irc://foo.bar/

    \n', + + '/r/www.example.com': + '

    /r/www.example.com

    \n', + + '.http://reddit.com': + '

    .http://reddit.com

    \n', + + '[r://](/aa)': + '

    r://http://reddit.com/

    \n', + + '/u/http://www.reddit.com/user/reddit': + '

    /u/http://www.reddit.com/user/reddit

    \n', + + 'www.http://example.com/': + '

    www.http://example.com/

    \n', + + ('|' * 5) + '\n' + ('-|' * 5) + '\n|\n': + '\n\n' + ('\n' * 4) + '\n\n\n\n\n
    \n', + + ('|' * 2) + '\n' + ('-|' * 2) + '\n|\n': + '\n\n' + ('\n' * 1) + '\n\n\n\n\n
    \n', + + ('|' * 65) + '\n' + ('-|' * 65) + '\n|\n': + '\n\n' + ('\n' * 64) + '\n\n\n\n\n
    \n', + + ('|' * 66) + '\n' + ('-|' * 66) + '\n|\n': + '

    ' + ('|' * 66) + '\n' + ('-|' * 66) + '\n|' + '

    \n', + + 'ϑ': + '

    ϑ

    \n', + + '&foobar;': + '

    &foobar;

    \n', + + ' ': + '

    &nbsp

    \n', + + '&#foobar;': + '

    &#foobar;

    \n', + + 'oobar;': + '

    &#xfoobar;

    \n', + + '�': + '

    &#9999999999;

    \n', + + 'c': + '

    c

    \n', + + '~': + '

    ~

    \n', + + '~': + '

    ~

    \n', + + '½': + '

    ½

    \n', + + 'aaa½aaa': + '

    aaa½aaa

    \n', + + '&': + '

    &

    \n', + + '&;': + '

    &;

    \n', + + '&#;': + '

    &#;

    \n', + + '&#;': + '

    &#;

    \n', + + '&#x;': + '

    &#x;

    \n', +} + +# Test that every numeric entity is encoded as +# it should be. +ILLEGAL_NUMERIC_ENTS = frozenset(itertools.chain( + xrange(0, 9), + xrange(11, 13), + xrange(14, 32), + xrange(55296, 57344), + xrange(65534, 65536), +)) + +ent_test_key = '' +ent_test_val = '' +for i in xrange(65550): + ent_testcase = '&#%d;&#x%x;' % (i, i) + ent_test_key += ent_testcase + if i in ILLEGAL_NUMERIC_ENTS: + ent_test_val += ent_testcase.replace('&', '&') + else: + ent_test_val += ent_testcase + +cases[ent_test_key] = '

    %s

    \n' % ent_test_val + +wiki_cases = { + '': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', + + '
    ': + '

    \n', +} + +class SnudownTestCase(unittest.TestCase): + def __init__(self, renderer=snudown.RENDERER_USERTEXT): + self.renderer = renderer + unittest.TestCase.__init__(self) + + def runTest(self): + output = snudown.markdown(self.input, renderer=self.renderer) + + for i, (a, b) in enumerate(zip(repr(self.expected_output), + repr(output))): + if a != b: + io = StringIO.StringIO() + print >> io, "TEST FAILED:" + print >> io, " input: %s" % repr(self.input) + print >> io, " expected: %s" % repr(self.expected_output) + print >> io, " actual: %s" % repr(output) + print >> io, " %s" % (' ' * i + '^') + self.fail(io.getvalue()) + + + +def test_snudown(): + suite = unittest.TestSuite() + + for input, expected_output in wiki_cases.iteritems(): + case = SnudownTestCase(renderer=snudown.RENDERER_WIKI) + case.input = input + case.expected_output = expected_output + suite.addTest(case) + + for input, expected_output in cases.iteritems(): + case = SnudownTestCase() + case.input = input + case.expected_output = expected_output + suite.addTest(case) + + return suite diff --git a/Symlinker/symlinker.py b/Symlinker/symlinker.py new file mode 100644 index 0000000..083d770 --- /dev/null +++ b/Symlinker/symlinker.py @@ -0,0 +1,153 @@ +import os +import sys +import string +import tkinter +import time +import traceback + +time.clock() + +LINKTYPES_L = ['Symbolic file', 'Hardlink file', 'Symbolic dir', 'Junction dir'] +LINKTYPES = {'Symbolic file': '', + 'Hardlink file': '/H', + 'Symbolic dir': '/D', + 'Junction dir': '/J' + } +LINKTYPES_DIR = ['Symbolic dir', 'Junction dir', LINKTYPES['Symbolic dir'], LINKTYPES['Junction dir']] +LINKTYPES_FILE = ['Symbolic file', 'Hardlink file', LINKTYPES['Symbolic file'], LINKTYPES['Hardlink file']] + +TRACER_AUTOVERIFY_DELAY = 0.5 + +COLOR_BLACK = '#000' +COLOR_YELLOW = '#aa0' +COLOR_GREEN = '#0a0' +COLOR_RED = '#a00' + +def assert_linktypes(linktype, symbolpath, actualpath): + if os.path.isdir(actualpath) and linktype not in LINKTYPES_DIR or \ + 
os.path.isfile(actualpath) and linktype not in LINKTYPES_FILE: + message = 'Invalid linktype {linktype} for target path {target}' + message = message.format(linktype=repr(linktype), target=repr(actualpath)) + raise TypeError(message) + +def mklink(linktype, symbolpath, actualpath): + symbolpath = os.path.abspath(symbolpath) + actualpath = os.path.abspath(actualpath) + try: + assert_linktypes(linktype, symbolpath, actualpath) + except TypeError: + traceback.print_exc() + return False + command = 'mklink {linktype} "{symbolpath}" "{actualpath}"' + command = command.format(linktype=linktype, + symbolpath=symbolpath, + actualpath=actualpath) + print(''.join(c for c in command if c in string.printable)) + status_code = os.system(command) + if status_code != 0: + return False + if linktype in LINKTYPES_DIR: + symtype = 'symlink' if linktype == '/D' else 'junction' + symlink_info = symtype + time.strftime('_%Y%m%d-%H%M%S.txt') + symlink_info = os.path.join(actualpath, symlink_info) + symlink_info = open(symlink_info, 'w') + symlink_info.write('actual: ' + actualpath) + symlink_info.write('\n') + symlink_info.write(symtype + ': ' + symbolpath) + symlink_info.close() + + +class LinkGUI: + def __init__(self): + self.t = tkinter.Tk() + + self.tracer_nextautoverify = 0 + self.tracer_lastkeystroke_verified = False + self.tracer_activewaiter = False + + self.stringvar_actualpath = tkinter.StringVar() + self.stringvar_dropdown = tkinter.StringVar() + self.label_actualpath = tkinter.Label(self.t, text='Actual path:') + self.label_symbolpath = tkinter.Label(self.t, text='Symbol path:') + self.entry_actualpath = tkinter.Entry(self.t, width=70, textvariable=self.stringvar_actualpath) + self.entry_symbolpath = tkinter.Entry(self.t, width=70) + self.dropdown_linktype = tkinter.OptionMenu(self.t, self.stringvar_dropdown, *LINKTYPES_L) + self.dropdown_linktype.configure(width=15) + self.button_do_it = tkinter.Button(self.t, text='Do it.', command=self.do_it) + + 
self.stringvar_actualpath.trace('w', self.tracewatcher) + self.stringvar_dropdown.trace('w', lambda *bb: self.tracer_verify_colors(False)) + self.stringvar_actualpath.set(os.getcwd()) + self.entry_symbolpath.insert(0, os.getcwd()) + + self.label_actualpath.grid(row=0, column=0, sticky='e') + self.label_symbolpath.grid(row=1, column=0, sticky='e') + self.entry_actualpath.grid(row=0, column=1, sticky='ew') + self.entry_symbolpath.grid(row=1, column=1, sticky='ew') + self.dropdown_linktype.grid(row=2, column=0) + self.button_do_it.grid(row=2, column=1, sticky='e') + + self.t.grid_columnconfigure(1, weight=1) + self.t.mainloop() + + def do_it(self, *bb): + linktype = self.stringvar_dropdown.get() + linktype = LINKTYPES[linktype] + actualpath = self.entry_actualpath.get() + symbolpath = self.entry_symbolpath.get() + status = mklink(linktype, actualpath=actualpath, symbolpath=symbolpath) + if status is False: + self.button_do_it.configure(bg=COLOR_RED) + else: + self.button_do_it.configure(bg=COLOR_GREEN) + + def tracewatcher(self, *bb): + self.tracer_lastkeystroke_verified = False + self.tracer_nextautoverify = time.time() + TRACER_AUTOVERIFY_DELAY + if self.tracer_activewaiter is False: + self.tracer_verify() + + def tracer_verify(self): + now = time.time() + if self.tracer_lastkeystroke_verified is True: + return + if now < self.tracer_nextautoverify: + delay = int(TRACER_AUTOVERIFY_DELAY * 1000) + self.t.after(delay, self.tracer_verify) + self.tracer_activewaiter = True + self.dropdown_linktype.config(fg=COLOR_YELLOW) + return + self.tracer_lastkeystroke_verified = True + self.tracer_activewaiter = False + + self.tracer_verify_colors(set_for_me=True) + + def tracer_verify_colors(self, set_for_me=False, *bb): + path = self.stringvar_actualpath.get() + linktype = self.stringvar_dropdown.get() + if os.path.isfile(path): + if set_for_me and linktype not in LINKTYPES_FILE: + self.stringvar_dropdown.set('Symbolic file') + return + if linktype in LINKTYPES_FILE: + 
self.dropdown_linktype.config(fg=COLOR_GREEN) + else: + self.dropdown_linktype.config(fg=COLOR_BLACK) + elif os.path.isdir(path): + if set_for_me and linktype not in LINKTYPES_DIR: + self.stringvar_dropdown.set('Symbolic dir') + return + if linktype in LINKTYPES_DIR: + self.dropdown_linktype.config(fg=COLOR_GREEN) + else: + self.dropdown_linktype.config(fg=COLOR_BLACK) + else: + self.dropdown_linktype.config(fg=COLOR_BLACK) + +#mklink(LINKTYPE_SYMBOLIC_DIR, 'examples\\symbolic_dir', 'examples\\actual_dir') +#mklink(LINKTYPE_JUNCTION_DIR, 'examples\\junction_dir', 'examples\\actual_dir') +#mklink(LINKTYPE_SYMBOLIC_FILE, 'examples\\symbolic_file.txt', 'examples\\actual_file.txt') +#mklink(LINKTYPE_HARDLINK_FILE, 'examples\\hardlink_file.txt', 'examples\\actual_file.txt') +linker = LinkGUI() +print('[ {0} elapsed ]'.format(round(time.clock(), 3))) + diff --git a/gitnotes.txt b/gitnotes.txt new file mode 100644 index 0000000..44cba88 --- /dev/null +++ b/gitnotes.txt @@ -0,0 +1 @@ +git filter-branch -f --index-filter "git rm -r --cached --ignore-unmatch SubredditBirthdays\sql.db" HEAD \ No newline at end of file