1 {-| 2 Module : rectifier 3 Description : Convert our "wiki-style" markdown links to exact titles/headers/filenames. 4 Copyright : (c) Robin Lee Powell, 2016 5 Someone Else, 2014 6 License : MIT 7 Maintainer : rlpowell@digitalkingdom.org 8 Stability : experimental 9 Portability : POSIX 10 11 Check our "wiki-style" markdown links, which match fuzzily in 12 various ways, to make sure they point at something unique, and 13 replace them with the exact titles/headers/filenames of the thing 14 they point at. 15 16 See "Wiki Links" in DESIGN-CODE for full details. 17 18 -} 19 -------------------------------------------------------------------------------- 20 {-# LANGUAGE OverloadedStrings #-} 21 {-# LANGUAGE QuasiQuotes #-} 22 {-# LANGUAGE FlexibleContexts #-} 23 module Main where 24 import System.Directory (createDirectoryIfMissing) 25 import System.FilePath (makeRelative, (</>), takeDirectory, takeBaseName) 26 import System.Environment (getArgs) 27 import qualified System.FilePath.Find as F (find, always, extension) 28 import System.FilePath.Find ((==?)) 29 import Control.Monad (mapM) 30 import Text.Pandoc (readMarkdown, Inline(..), docTitle, readerStandalone, writePlain, writeMarkdown, writerTemplate, runPure, PandocError(..)) 31 import Text.Pandoc.Templates (getDefaultTemplate) 32 import Text.Pandoc.Walk (query, walk) 33 import Text.Pandoc.Definition (Pandoc(..), Block(..), nullMeta) 34 import Network.URI (unEscapeString, isURI) 35 import Data.List (find, intercalate, sortBy) 36 import Data.Maybe (isJust) 37 import Data.Char (toLower) 38 -- import Debug.Trace 39 import Text.Regex.PCRE.Heavy ((=~)) 40 import Text.Regex.PCRE.Light (compile, caseless) 41 import Data.ByteString.Char8 (pack) 42 import Text.EditDistance (defaultEditCosts, levenshteinDistance) 43 import Data.Function (on) 44 import qualified Data.Text as T 45 import HBlog.Lib 46 47 -------------------------------------------------------------------------------- 48 49 -- Headers and titles from markdown files. 50 data Target = THeader { tTarget :: String, tFile :: FilePath } | TTitle { tTarget :: String, tFile :: FilePath } deriving (Eq, Ord, Show) 51 52 isHeader :: Target -> Bool 53 isHeader (THeader _ _) = True 54 isHeader _ = False 55 56 isTitle :: Target -> Bool 57 isTitle (TTitle _ _) = True 58 isTitle _ = False 59 60 61 -------------------------------------------------------------------------------- 62 main :: IO () 63 main = do 64 args <- getArgs 65 case args of 66 indir:outdir:[] -> walkTree indir outdir 67 _ -> putStrLn "Need exactly two arguments, input directory and output directory." 68 69 -- Walk the tree of input files, rewriting as necessary the links in 70 -- each file to point to the full text of link targets that actually 71 -- exist. 72 -- 73 -- A mildly unfortunate issue is that we walk the files twice; once 74 -- to get all the targets and again to munge the links to them. 75 -- It's not obvious that that's fixable, though, and it's not like 76 -- we're talking about thousands of files. 77 walkTree :: String -> String -> IO () 78 walkTree indir outdir = do 79 -- Grab the possible link targets 80 targets <- targetPrep indir 81 -- error $ "targets: " ++ show targets 82 putStrLn $ "Searching for files ending in .md" 83 files <- F.find F.always (F.extension ==? ".md") indir 84 putStrLn $ "Files found: " ++ (intercalate " " files) 85 _ <- mapM (handleFile indir outdir targets) files 86 return () 87 88 -- Walks the file tree running readTargets 89 targetPrep :: FilePath -> IO [Target] 90 targetPrep indir = do 91 files <- F.find F.always (F.extension ==? ".md") indir 92 targets <- mapM readTargets files 93 return $ filter (\x -> (tTarget x) /= "") $ concat targets 94 95 -- Gather all the titles and headers from all the files, along with 96 -- their file namers, and turn them into Targets 97 -- 98 -- We don't care about anything else about these files; we're just 99 -- making sure that each link points to a valid target. 100 readTargets :: FilePath -> IO [Target] 101 readTargets file = do 102 body <- readFile file 103 let pandocEither = runPure $ readMarkdown hblogPandocReaderOptions $ T.pack body 104 return $ concat [titleQuery pandocEither file, headersQuery pandocEither file] 105 106 -- Turn titles into Targets 107 titleQuery :: Either PandocError Pandoc -> FilePath -> [Target] 108 titleQuery (Right (Pandoc meta _)) file = 109 [TTitle { tFile = file, tTarget = target }] 110 where 111 target = case runPure $ writePlain hblogPandocWriterOptions $ Pandoc nullMeta $ [Plain $ docTitle meta] of 112 Left (PandocSomeError err) -> "rectifier titleQuery: unknown error: " ++ err 113 Left _ -> "rectifier titleQuery: unknown error!" 114 Right item' -> T.unpack item' 115 titleQuery (Left e) file = error $ "Pandoc error! on file " ++ file ++ ": " ++ (show e) 116 117 -- Turn headers into Targets 118 headersQuery :: Either PandocError Pandoc -> FilePath -> [Target] 119 headersQuery (Right x) file = map (\str -> THeader { tFile = file, tTarget = str}) $ query getHeaders x 120 headersQuery (Left e) file = error $ "Pandoc error! on file " ++ file ++ ": " ++ (show e) 121 122 -- Pull headers out as strings 123 getHeaders :: Block -> [String] 124 getHeaders (Header _ _ xs) = 125 [headers] 126 where 127 headers = case runPure $ writePlain hblogPandocWriterOptions $ Pandoc nullMeta $ [Plain xs] of 128 Left (PandocSomeError err) -> "rectifier getHeaders: unknown error: " ++ err 129 Left _ -> "rectifier getHeaders: unknown error!" 130 Right item' -> T.unpack item' 131 getHeaders _ = [] 132 133 -- Basically a wrapper for rectifie that does the IO bits; the code 134 -- below this is pure 135 handleFile :: FilePath -> FilePath -> [Target] -> FilePath -> IO () 136 handleFile indir outdir targets file = do 137 putStrLn $ "Processing file " ++ file 138 let shortname = makeRelative indir file 139 body <- readFile $ indir </> shortname 140 Right mdTemplate <- getDefaultTemplate Nothing "markdown" 141 let newBody = rectify body file targets mdTemplate in do 142 _ <- createDirectoryIfMissing True (takeDirectory $ (outdir </> shortname)) 143 _ <- writeFile (outdir </> shortname) newBody 144 return () 145 146 -- Read, munge, and rebuild the markdown file we're working on 147 rectify :: String -> FilePath -> [Target] -> String -> String 148 rectify body file targets mdTemplate = 149 let pandocEither = runPure $ readMarkdown hblogPandocReaderOptions $ T.pack body 150 newPandoc = linksWalk pandocEither file targets 151 in 152 case runPure $ writeMarkdown hblogPandocWriterOptions { writerTemplate = Just mdTemplate } newPandoc of 153 Left (PandocSomeError err) -> "rectifier rectify: unknown error: " ++ err 154 Left _ -> "rectifier rectify: unknown error!" 155 Right item' -> T.unpack item' 156 157 -- Walk the Pandoc 158 linksWalk :: Either PandocError Pandoc -> FilePath -> [Target] -> Pandoc 159 linksWalk (Right x) file targets = walk (replaceLink file targets) x 160 linksWalk (Left e) file _ = error $ "Pandoc error! on file " ++ file ++ ": " ++ (show e) 161 162 -- Return true if the target is a header in the specified file 163 isFileHeader :: FilePath -> Target -> Bool 164 isFileHeader file target = isHeader target && tFile target == file 165 166 -- Replaces the given link; mostly this is just a wrapper for 167 -- findTarget, but there's some complexity if it's a fragment style 168 -- link 169 replaceLink :: FilePath -> [Target] -> Inline -> Inline 170 replaceLink file targets linky@(Link x y (ickyLinkStr, z)) = 171 let linkStr = unEscapeString ickyLinkStr in 172 -- Step 1 from "Rectification" in DESIGN-CODE : check if it 173 -- looks like a URL or an absolute path within our blog, and 174 -- ignore it in those cases 175 if isURI linkStr || head linkStr == '/' then 176 linky 177 else 178 if (length $ filter (== '#') linkStr) == 1 then 179 -- If it's a fragment (a # link), first check the left side 180 -- of the # and if that matches a title, look in that file 181 -- for header matches 182 let maybeGoodTarget = findTarget (fst $ span (/= '#') linkStr) isTitle targets in 183 case maybeGoodTarget of 184 Nothing -> error $ "Failed to find valid main target (the part before #) for link \"" ++ linkStr ++ "\" in file " ++ file 185 Just goodTarget -> 186 -- Now that we have the file, look for the fragment 187 -- in the headers of that file 188 let maybeGoodFragTarget = findTarget (tail $ snd $ span (/= '#') linkStr) (isFileHeader (tFile goodTarget)) targets in 189 case maybeGoodFragTarget of 190 Nothing -> error $ "Failed to find valid fragment target (the part after #) for link \"" ++ linkStr ++ "\" in file " ++ file 191 Just goodFragTarget -> Link x y ((tTarget goodTarget) ++ "#" ++ (tTarget goodFragTarget), z) 192 else 193 -- No # ; try all the internal file headers first, and then 194 -- all the titles. 195 let twoMaybes = find isJust [ 196 findTarget linkStr (isTitle) targets 197 , findTarget linkStr (isFileHeader file) targets 198 ] in 199 -- FIXME: There *must* be a more idiomatic way to do this 200 case twoMaybes of 201 Nothing -> noTarget linkStr file targets 202 Just Nothing -> noTarget linkStr file targets 203 Just (Just goodTarget) -> Link x y ((tTarget goodTarget), z) 204 replaceLink _ _ x = x 205 206 -- Return true if the target is visible to us, that is, is a title 207 -- or is a header in this same file. 208 isTitleOrFileHeader :: FilePath -> Target -> Bool 209 isTitleOrFileHeader file target = isFileHeader file target || isTitle target 210 211 levenshteinTargetSort :: String -> [Target] -> FilePath -> [Char] 212 levenshteinTargetSort linkStr targets file = 213 intercalate "\n" $ 214 sortBy (compare `on` (levenshteinDistance defaultEditCosts) linkStr) $ 215 map tTarget $ 216 filter (isTitleOrFileHeader file) targets 217 218 noTarget :: [Char] -> [Char] -> [Target] -> a 219 noTarget linkStr file targets = error $ "Failed to find valid target for link \"" ++ linkStr ++ "\" in file " ++ file ++ 220 "\n\nPossible targets include:\n\n" ++ (levenshteinTargetSort linkStr targets file) ++ "\n\n" 221 222 makeFuzzy :: String -> String 223 makeFuzzy input = ".*" ++ (intercalate ".*" $ words input) ++ ".*" 224 225 leftRegex :: String -> String -> Bool 226 leftRegex left right = right =~ (compile (pack left) [caseless]) 227 228 -- This is where we actually walk through the steps from 229 -- "Rectification" in DESIGN-CODE (except step 1); details are there. 230 findTarget :: String -> (Target -> Bool) -> [Target] -> Maybe Target 231 findTarget linkStr myFilter targets = 232 -- We want to stop immediately once we find something that works. 233 let twoMaybes = find isJust [ 234 -- Step 2 235 applyToBothAndFind id tTarget (==) linkStr (filter myFilter targets) 236 -- Step 3 237 , applyToBothAndFind (map toLower) ((map toLower) . tTarget) (==) linkStr (filter myFilter targets) 238 -- Step 4 239 , applyToBothAndFind trimMD (trimMD . tFile) (==) linkStr (filter myFilter targets) 240 -- Step 5 241 , applyToBothAndFind trimMDLower (trimMDLower . tFile) (==) linkStr (filter myFilter targets) 242 -- Step 6 243 , applyToBothAndFind makeFuzzy tTarget (leftRegex) linkStr (filter myFilter targets) 244 , applyToBothAndFind makeFuzzy tFile (leftRegex) linkStr (filter myFilter targets) 245 , Nothing 246 ] in 247 -- FIXME: There *must* be a more idiomatic way to do this 248 case twoMaybes of 249 Nothing -> Nothing 250 Just Nothing -> Nothing 251 Just (Just x) -> Just x 252 253 -- Trim ".md" from the file name 254 trimMD :: FilePath -> String 255 trimMD fname = let maybeStripped = T.stripSuffix ".md" $ T.pack $ (takeBaseName fname) 256 in maybe fname T.unpack maybeStripped 257 258 -- Trim ".md" from the file name and force it to lowercase 259 trimMDLower :: FilePath -> String 260 trimMDLower fname = map toLower (trimMD (takeBaseName fname)) 261 262 -- Take left and right string munging functions, a comparison 263 -- function, the link string we want to rectify, and the targets to 264 -- use to rectify it. Applies the munging functions to the 265 -- appropriate sides of the comparison, and runs the comparison 266 -- function. 267 applyToBothAndFind :: (String -> String) -> (Target -> String) -> (String -> String -> Bool) -> String -> [Target] -> Maybe Target 268 applyToBothAndFind _ _ _ _ [] = Nothing 269 applyToBothAndFind mungeLeft mungeRight comp linkStr xs = 270 let findings = applyToBothAndFindInner mungeLeft mungeRight comp linkStr xs in 271 if length findings > 1 then 272 error $ "Found more than one match for \"" ++ linkStr ++ "\": " ++ (intercalate " " $ map tFile findings) 273 else 274 if length findings < 1 then 275 Nothing 276 else 277 Just $ head findings 278 279 applyToBothAndFindInner :: (String -> String) -> (Target -> String) -> (String -> String -> Bool) -> String -> [Target] -> [Target] 280 applyToBothAndFindInner _ _ _ _ [] = [] 281 applyToBothAndFindInner mungeLeft mungeRight comp linkStr (target:xs) = 282 -- trace ("mlls: " ++ (mungeLeft linkStr)) $ 283 -- trace ("mrt: " ++ (mungeRight target)) $ 284 if (mungeLeft linkStr) `comp` (mungeRight target) then 285 [target] ++ (applyToBothAndFindInner mungeLeft mungeRight comp linkStr xs) 286 else 287 applyToBothAndFindInner mungeLeft mungeRight comp linkStr xs