#!/bin/bash
#
# Mirror the pages of a PmWiki site as Markdown.
#
# For every page file found under ${WIKI_DIR} (names of the form
# "<base>.<file>") the script:
#   1. downloads ${URL}/?n=<base>.<file>?action=print to
#      PmWiki/<base>/<file>.HTM (a pristine copy is kept as .HTM_ORIGINAL),
#   2. trims the download with csplit,
#   3. strips a fixed list of HTML attributes with sed,
#   4. converts the result to commonmark_x with pandoc
#      (a pristine copy is kept as .md_ORIGINAL),
#   5. post-processes the Markdown with sed,
#   6. links the Markdown into combined/<base>/<file>.md.
#
# All output is written relative to ${WORK_DIR}.
# External tools used: find, curl, csplit, sed, pandoc, ln.

readonly URL="https://wiki.devuan.org"
readonly WIKI_DIR="/opt/pmwiki/wiki.d"
readonly WORK_DIR="/opt/mergedWork"

# find(1) predicates excluding backup and housekeeping files.
# Kept in an array so that patterns such as "*~" reach find verbatim instead
# of being word-split and glob-expanded by the shell.
filter=(
  -not -name "*~" -a
  -not -name ".flock" -a
  -not -name ".htaccess" -a
  -not -name ".lastmod" -a
  -not -name ".pageindex" -a
)

# Refuse to continue in the wrong directory: every path below is relative.
pushd "${WORK_DIR}" || {
  printf 'cannot enter %s\n' "${WORK_DIR}" >&2
  exit 1
}

# NUL-delimited names + "IFS= read -r" keep odd file names intact.
find "${WIKI_DIR}" "${filter[@]}" \
  -name "*.*" -type f,l -printf '%P\0' |
  while IFS= read -r -d '' line; do
    # Split "<base>.<file>[.<more>]": first and second dot-separated fields.
    base=${line%%.*}
    rest=${line#*.}
    file=${rest%%.*}
    # page="?n=${line}"

    html="PmWiki/${base}/${file}.HTM"
    md="PmWiki/${base}/${file}.md"

    mkdir -p "PmWiki/${base}"
    mkdir -p "combined/${base}"
    printf '%s\n' "Converting ${URL}/?n=${base}.${file}?action=print -> PmWiki/${base}/${file}.md"

    # pandoc -f html -t markdown --self-contained ${URL}/?n=${base}.${file} >PmWiki/${base}/${file}.md
    # TODO - try curl, to see what is actually downloaded, and maybe not download unchanged pages.  curl to .HTM
    # Doesn't help with redownloads, coz natch a dynamic site isn't cached.  But I can at least comment out the curl command during testing to save time.
    # curl --no-progress-meter ${URL}/?n=${base}.${file}?action=markdown -o PmWiki/${base}/${file}.MD
    #
    # NOTE(review): the URL carries two '?' separators; kept as-is — confirm
    # the server accepts it.
    if ! curl --no-progress-meter "${URL}/?n=${base}.${file}?action=print" -o "${html}"; then
      # Without a fresh download the remaining steps would work on a missing
      # or stale file, so skip this page.
      printf 'download failed, skipping %s.%s\n' "${base}" "${file}" >&2
      continue
    fi
    cp -- "${html}" "${html}_ORIGINAL"

    # csplit writes its pieces as xx00, xx01, ... into the current directory;
    # -k keeps them on error, -s silences the size report.
    # NOTE(review): both patterns are empty in this copy of the script; they
    # may have lost their content — confirm against the upstream version.
    csplit -ks "${html}" '%%' '//'
    if [[ -f xx00 ]]; then
      # Replace the download with the first piece csplit produced.
      rm -- "${html}"
      mv -- xx00 "${html}"
    fi

    # Drop link/class/style attributes from the HTML before conversion.
    sed -i -E \
      -e "s/rel='nofollow'//g" \
      -e "s/target='_blank'//g" \
      -e "s/class='categorylink'//g" \
      -e "s/class='createlink'//g" \
      -e "s/class='createlinktext'//g" \
      -e "s/class='escaped'//g" \
      -e "s/class='diffmarkup'//g" \
      -e "s/class='selflink'//g" \
      -e "s/class='urllink'//g" \
      -e "s/class='vspace'//g" \
      -e "s/class='wikilink'//g" \
      -e "s/style='.*;'//g" \
      -- "${html}"
    # Disabled alternatives to the style expression:
    #   -e "s/style='background-color: #.*;'//g" \
    #   -e "s/style='font-size: .*;'//g"

    pandoc -f html -t commonmark_x --self-contained "${html}" >"${md}"
    # pandoc -f markdown -t commonmark_x --self-contained PmWiki//${base}/${file}.MD >PmWiki/${base}/${file}.md
    cp -- "${md}" "${md}_ORIGINAL"

    # Attempt to clean things up, badly.
    sed -i -E \
      -e 's/\$/\$dlr\$/g' \
      -e 's/\{#.*\}//g' \
      -e '/^:::/d' \
      -- "${md}"
    # Disabled clean-up expressions.
    # NOTE(review): the last two entries look truncated in this copy —
    # confirm against the upstream version.
    #   -e '/\[Site$/d' \
    #   -e '/^Page last modified on /d' \
    #   -e '/^\[\]/d' \
    #   -e "s/\`\`\{=html\}\`<\/a>\`\{=html\}//g" \
    #   -e "s/^\`> PmWiki/${base}/${file}.md

    # pandoc -t html -f commonmark_x --self-contained PmWiki/${base}/${file}.md > PmWiki/${base}/${file}.htm
    # cmark-gfm -t html -e footnotes -e table -e strikethrough PmWiki/${base}/${file}.md > PmWiki/${base}/${file}.body
    # ln -frs PmWiki/${base}/${file}.body combined/${base}/${file}.body
    ln -frs "${md}" "combined/${base}/${file}.md"

    # Remove the leftover csplit piece so it cannot leak into the next page.
    if [[ -f xx01 ]]; then
      rm -- xx01
    fi
  done

popd