1 {-|
    2 Module      : rectifier
    3 Description : Convert our "wiki-style" markdown links to exact titles/headers/filenames.
    4 Copyright   : (c) Robin Lee Powell, 2016
    5                   Someone Else, 2014
    6 License     : MIT
    7 Maintainer  : rlpowell@digitalkingdom.org
    8 Stability   : experimental
    9 Portability : POSIX
   10 
   11 Check our "wiki-style" markdown links, which match fuzzily in
   12 various ways, to make sure they point at something unique, and
   13 replace them with the exact titles/headers/filenames of the thing
   14 they point at.
   15 
   16 See "Wiki Links" in DESIGN-CODE for full details.
   17 
   18 -}
   19 --------------------------------------------------------------------------------
   20 {-# LANGUAGE OverloadedStrings #-}
   21 {-# LANGUAGE QuasiQuotes #-}
   22 {-# LANGUAGE FlexibleContexts #-}
   23 module Main where
   24 import System.Directory (createDirectoryIfMissing)
   25 import System.FilePath (makeRelative, (</>), takeDirectory, takeBaseName)
   26 import System.Environment (getArgs)
   27 import qualified System.FilePath.Find as F (find, always, extension)
   28 import System.FilePath.Find ((==?))
   29 import Control.Monad (mapM)
   30 import Text.Pandoc (readMarkdown, Inline(..), docTitle, readerStandalone, writePlain, writeMarkdown, writerTemplate, runPure, PandocError(..))
   31 import Text.Pandoc.Templates (getDefaultTemplate)
   32 import Text.Pandoc.Walk (query, walk)
   33 import Text.Pandoc.Definition (Pandoc(..), Block(..), nullMeta)
   34 import Network.URI (unEscapeString, isURI)
   35 import Data.List (find, intercalate, sortBy)
   36 import Data.Maybe (isJust)
   37 import Data.Char (toLower)
   38 -- import Debug.Trace
   39 import Text.Regex.PCRE.Heavy ((=~))
   40 import Text.Regex.PCRE.Light (compile, caseless)
   41 import Data.ByteString.Char8 (pack)
   42 import Text.EditDistance (defaultEditCosts, levenshteinDistance)
   43 import Data.Function (on)
   44 import qualified Data.Text as T
   45 import HBlog.Lib
   46 
   47 --------------------------------------------------------------------------------
   48 
   49 -- Headers and titles from markdown files.
   50 data Target = THeader { tTarget :: String, tFile :: FilePath } | TTitle { tTarget :: String, tFile :: FilePath } deriving (Eq, Ord, Show)
   51 
   52 isHeader :: Target -> Bool
   53 isHeader (THeader _ _) = True
   54 isHeader _ = False
   55 
   56 isTitle :: Target -> Bool
   57 isTitle (TTitle _ _) = True
   58 isTitle _ = False
   59 
   60 
   61 --------------------------------------------------------------------------------
   62 main :: IO ()
   63 main = do
   64   args <- getArgs
   65   case args of
   66     indir:outdir:[] -> walkTree indir outdir
   67     _               -> putStrLn "Need exactly two arguments, input directory and output directory."
   68 
   69 -- Walk the tree of input files, rewriting as necessary the links in
   70 -- each file to point to the full text of link targets that actually
   71 -- exist.
   72 --
   73 -- A mildly unfortunate issue is that we walk the files twice; once
   74 -- to get all the targets and again to munge the links to them.
   75 -- It's not obvious that that's fixable, though, and it's not like
   76 -- we're talking about thousands of files.
   77 walkTree :: String -> String -> IO ()
   78 walkTree indir outdir = do
   79   -- Grab the possible link targets
   80   targets <- targetPrep indir
   81   -- error $ "targets: " ++ show targets
   82   putStrLn $ "Searching for files ending in .md"
   83   files <- F.find F.always (F.extension ==? ".md") indir 
   84   putStrLn $ "Files found: " ++ (intercalate " " files)
   85   _ <- mapM (handleFile indir outdir targets) files
   86   return ()
   87 
   88 -- Walks the file tree running readTargets
   89 targetPrep :: FilePath -> IO [Target]
   90 targetPrep indir = do
   91   files <- F.find F.always (F.extension ==? ".md") indir 
   92   targets <- mapM readTargets files
   93   return $ filter (\x -> (tTarget x) /= "") $ concat targets
   94 
   95 -- Gather all the titles and headers from all the files, along with
   96 -- their file namers, and turn them into Targets
   97 --
   98 -- We don't care about anything else about these files; we're just
   99 -- making sure that each link points to a valid target.
  100 readTargets :: FilePath -> IO [Target]
  101 readTargets file = do
  102   body <- readFile file
  103   let pandocEither = runPure $ readMarkdown hblogPandocReaderOptions $ T.pack body
  104   return $ concat [titleQuery pandocEither file, headersQuery pandocEither file]
  105 
  106 -- Turn titles into Targets
  107 titleQuery :: Either PandocError Pandoc -> FilePath -> [Target]
  108 titleQuery (Right (Pandoc meta _)) file =
  109   [TTitle { tFile = file, tTarget = target }]
  110   where
  111     target = case runPure $ writePlain hblogPandocWriterOptions $ Pandoc nullMeta $ [Plain $ docTitle meta] of
  112                   Left (PandocSomeError err)  -> "rectifier titleQuery: unknown error: " ++ err
  113                   Left _ -> "rectifier titleQuery: unknown error!"
  114                   Right item'              -> T.unpack item'
  115 titleQuery (Left e) file = error $ "Pandoc error! on file " ++ file ++ ": " ++ (show e)
  116 
  117 -- Turn headers into Targets
  118 headersQuery :: Either PandocError Pandoc -> FilePath -> [Target]
  119 headersQuery (Right x) file = map (\str -> THeader { tFile = file, tTarget = str}) $ query getHeaders x
  120 headersQuery (Left e) file = error $ "Pandoc error! on file " ++ file ++ ": " ++ (show e)
  121 
  122 -- Pull headers out as strings
  123 getHeaders :: Block -> [String]
  124 getHeaders (Header _ _ xs) =
  125   [headers]
  126   where
  127     headers = case runPure $ writePlain hblogPandocWriterOptions $ Pandoc nullMeta $ [Plain xs] of
  128                   Left (PandocSomeError err)  -> "rectifier getHeaders: unknown error: " ++ err
  129                   Left _ -> "rectifier getHeaders: unknown error!"
  130                   Right item'              -> T.unpack item'
  131 getHeaders _ = []
  132 
  133 -- Basically a wrapper for rectifie that does the IO bits; the code
  134 -- below this is pure
  135 handleFile :: FilePath -> FilePath -> [Target] -> FilePath -> IO ()
  136 handleFile indir outdir targets file = do
  137   putStrLn $ "Processing file " ++ file
  138   let shortname = makeRelative indir file
  139   body <- readFile $ indir </> shortname
  140   Right mdTemplate <- getDefaultTemplate Nothing "markdown"
  141   let newBody = rectify body file targets mdTemplate in do
  142     _ <- createDirectoryIfMissing True (takeDirectory $ (outdir </> shortname))
  143     _ <- writeFile (outdir </> shortname) newBody
  144     return ()
  145 
  146 -- Read, munge, and rebuild the markdown file we're working on
  147 rectify :: String -> FilePath -> [Target] -> String -> String
  148 rectify body file targets mdTemplate =
  149   let pandocEither = runPure $ readMarkdown hblogPandocReaderOptions $ T.pack body
  150       newPandoc = linksWalk pandocEither file targets
  151       in
  152         case runPure $ writeMarkdown hblogPandocWriterOptions { writerTemplate = Just mdTemplate } newPandoc of
  153                   Left (PandocSomeError err)  -> "rectifier rectify: unknown error: " ++ err
  154                   Left _ -> "rectifier rectify: unknown error!"
  155                   Right item'              -> T.unpack item'
  156 
  157 -- Walk the Pandoc
  158 linksWalk :: Either PandocError Pandoc -> FilePath -> [Target] -> Pandoc
  159 linksWalk (Right x) file targets = walk (replaceLink file targets) x
  160 linksWalk (Left e) file _ = error $ "Pandoc error! on file " ++ file ++ ": " ++ (show e)
  161 
  162 -- Return true if the target is a header in the specified file
  163 isFileHeader :: FilePath -> Target -> Bool
  164 isFileHeader file target = isHeader target && tFile target == file
  165 
  166 -- Replaces the given link; mostly this is just a wrapper for
  167 -- findTarget, but there's some complexity if it's a fragment style
  168 -- link
  169 replaceLink :: FilePath -> [Target] -> Inline -> Inline
  170 replaceLink file targets linky@(Link x y (ickyLinkStr, z)) =
  171   let linkStr = unEscapeString ickyLinkStr in
  172     -- Step 1 from "Rectification" in DESIGN-CODE : check if it
  173     -- looks like a URL or an absolute path within our blog, and
  174     -- ignore it in those cases
  175     if isURI linkStr || head linkStr == '/' then
  176       linky
  177     else
  178       if (length $ filter (== '#') linkStr) == 1 then
  179         -- If it's a fragment (a # link), first check the left side
  180         -- of the # and if that matches a title, look in that file
  181         -- for header matches
  182         let maybeGoodTarget = findTarget (fst $ span (/= '#') linkStr) isTitle targets in
  183             case maybeGoodTarget of
  184               Nothing -> error $ "Failed to find valid main target (the part before #) for link \"" ++ linkStr ++ "\" in file " ++ file
  185               Just goodTarget ->
  186                 -- Now that we have the file, look for the fragment
  187                 -- in the headers of that file
  188                 let maybeGoodFragTarget = findTarget (tail $ snd $ span (/= '#') linkStr) (isFileHeader (tFile goodTarget)) targets in
  189                   case maybeGoodFragTarget of
  190                     Nothing -> error $ "Failed to find valid fragment target (the part after #) for link \"" ++ linkStr ++ "\" in file " ++ file
  191                     Just goodFragTarget -> Link x y ((tTarget goodTarget) ++ "#" ++ (tTarget goodFragTarget), z)
  192       else
  193         -- No # ; try all the internal file headers first, and then
  194         -- all the titles.
  195         let twoMaybes = find isJust [
  196                   findTarget linkStr (isTitle) targets
  197                 , findTarget linkStr (isFileHeader file) targets
  198               ] in
  199             -- FIXME: There *must* be a more idiomatic way to do this
  200             case twoMaybes of
  201               Nothing -> noTarget linkStr file targets
  202               Just Nothing -> noTarget linkStr file targets
  203               Just (Just goodTarget) -> Link x y ((tTarget goodTarget), z)
  204 replaceLink _ _ x = x
  205 
  206 -- Return true if the target is visible to us, that is, is a title
  207 -- or is a header in this same file.
  208 isTitleOrFileHeader :: FilePath -> Target -> Bool
  209 isTitleOrFileHeader file target = isFileHeader file target || isTitle target
  210 
  211 levenshteinTargetSort :: String -> [Target] -> FilePath -> [Char]
  212 levenshteinTargetSort linkStr targets file =
  213   intercalate "\n" $
  214     sortBy (compare `on` (levenshteinDistance defaultEditCosts) linkStr) $
  215     map tTarget $
  216     filter (isTitleOrFileHeader file) targets
  217 
  218 noTarget :: [Char] -> [Char] -> [Target] -> a
  219 noTarget linkStr file targets = error $ "Failed to find valid target for link \"" ++ linkStr ++ "\" in file " ++ file ++
  220   "\n\nPossible targets include:\n\n" ++ (levenshteinTargetSort linkStr targets file) ++ "\n\n"
  221 
  222 makeFuzzy :: String -> String
  223 makeFuzzy input = ".*" ++ (intercalate ".*" $ words input) ++ ".*"
  224 
  225 leftRegex :: String -> String -> Bool
  226 leftRegex left right = right =~ (compile (pack left) [caseless])
  227 
  228 -- This is where we actually walk through the steps from
  229 -- "Rectification" in DESIGN-CODE (except step 1); details are there.
  230 findTarget :: String -> (Target -> Bool) -> [Target] -> Maybe Target
  231 findTarget linkStr myFilter targets =
  232   -- We want to stop immediately once we find something that works.
  233   let twoMaybes = find isJust [
  234           -- Step 2
  235           applyToBothAndFind id                 tTarget                         (==) linkStr (filter myFilter targets)
  236           -- Step 3
  237         , applyToBothAndFind (map toLower)      ((map toLower) . tTarget)       (==) linkStr (filter myFilter targets)
  238           -- Step 4
  239         , applyToBothAndFind trimMD             (trimMD . tFile)                (==) linkStr (filter myFilter targets)
  240           -- Step 5
  241         , applyToBothAndFind trimMDLower        (trimMDLower . tFile)           (==) linkStr (filter myFilter targets)
  242           -- Step 6
  243         , applyToBothAndFind makeFuzzy          tTarget                         (leftRegex) linkStr (filter myFilter targets)
  244         , applyToBothAndFind makeFuzzy          tFile                           (leftRegex) linkStr (filter myFilter targets)
  245         , Nothing
  246         ] in
  247           -- FIXME: There *must* be a more idiomatic way to do this
  248           case twoMaybes of
  249             Nothing -> Nothing
  250             Just Nothing -> Nothing
  251             Just (Just x) -> Just x
  252 
  253 -- Trim ".md" from the file name
  254 trimMD :: FilePath -> String
  255 trimMD fname = let maybeStripped = T.stripSuffix ".md" $ T.pack $ (takeBaseName fname)
  256   in maybe fname T.unpack maybeStripped
  257 
  258 -- Trim ".md" from the file name and force it to lowercase
  259 trimMDLower :: FilePath -> String
  260 trimMDLower fname = map toLower (trimMD (takeBaseName fname))
  261 
  262 -- Take left and right string munging functions, a comparison
  263 -- function, the link string we want to rectify, and the targets to
  264 -- use to rectify it. Applies the munging functions to the
  265 -- appropriate sides of the comparison, and runs the comparison
  266 -- function.
  267 applyToBothAndFind :: (String -> String) -> (Target -> String) -> (String -> String -> Bool) -> String -> [Target] -> Maybe Target
  268 applyToBothAndFind _ _ _ _ [] = Nothing
  269 applyToBothAndFind mungeLeft mungeRight comp linkStr xs =
  270   let findings = applyToBothAndFindInner mungeLeft mungeRight comp linkStr xs in
  271     if length findings > 1 then
  272       error $ "Found more than one match for \"" ++ linkStr ++ "\": " ++ (intercalate " " $ map tFile findings)
  273     else
  274     if length findings < 1 then
  275       Nothing
  276     else
  277       Just $ head findings
  278 
  279 applyToBothAndFindInner :: (String -> String) -> (Target -> String) -> (String -> String -> Bool) -> String -> [Target] -> [Target]
  280 applyToBothAndFindInner _ _ _ _ [] = []
  281 applyToBothAndFindInner mungeLeft mungeRight comp linkStr (target:xs) =
  282   -- trace ("mlls: " ++ (mungeLeft linkStr)) $
  283   -- trace ("mrt: " ++ (mungeRight target)) $
  284   if (mungeLeft linkStr) `comp` (mungeRight target) then
  285     [target] ++ (applyToBothAndFindInner mungeLeft mungeRight comp linkStr xs)
  286   else
  287     applyToBothAndFindInner mungeLeft mungeRight comp linkStr xs