From d1e2966ab8d3fd38edfa3e50fae8e26601ef1902 Mon Sep 17 00:00:00 2001 From: dvs1 Date: Mon, 20 Jan 2025 14:50:57 +1000 Subject: Clean up the cleaning up --- SuckItFos | 16 +++++++++------- SuckItPm | 30 ++++++++++++++++-------------- TODO.md | 10 +++++----- 3 files changed, 30 insertions(+), 26 deletions(-) diff --git a/SuckItFos b/SuckItFos index b59f6b0..11adaf9 100755 --- a/SuckItFos +++ b/SuckItFos @@ -10,7 +10,7 @@ filter=" -name TWiki -prune -o \ " -pushd /opt/merged +pushd /opt/mergedWork find /opt/Foswiki/data ${filter} \ -name "*.txt" -type f,l -printf "%P\n" | while read line @@ -26,7 +26,6 @@ do # TODO - try curl, to see what is actually downloaded, and maybe not download unchanged pages. curl to .HTM # Doesn't help with redownloads, coz natch a dynamic site isn't cached. But I can at least comment out the curl command during testing to save time. curl --silent --no-progress-meter ${URL}/${base}/${file}?cover=print -o Foswiki/${base}/${file}.HTM - cp Foswiki/${base}/${file}.HTM Foswiki/${base}/${file}.HTM_ORIGINAL csplit -ks Foswiki/${base}/${file}.HTM '%
%' '/
/' if [ -f xx00 ]; then @@ -58,18 +57,21 @@ do sed -i -E Foswiki/${base}/${file}.md \ -e 's/\$/\$dlr\$/g' \ -e 's/\{#.*\}//g' \ - -e 's/\{\.pattern.*\}//g' \ - -e 's/\{\.pattern.*//g' \ + -e '/^:::/d' \ -e '/^/d' \ - -e '/^:::/d' +# -e 's/\{\.pattern.*\}//g' \ +# -e 's/\{\.pattern.*//g' \ + echo -e "****\n[Original page](${URL}/${base}/${file}) where maybe you can edit it." >> Foswiki/${base}/${file}.md # pandoc -t html -f commonmark_x --self-contained Foswiki/${base}/${file}.md > Foswiki/${base}/${file}.htm # cmark-gfm -t html -e footnotes -e table -e strikethrough Foswiki/${base}/${file}.md > Foswiki/${base}/${file}.body # ln -frs Foswiki/${base}/${file}.body combined/${base}/${file}.body ln -frs Foswiki/${base}/${file}.md combined/${base}/${file}.md -done -notYetAnotherWiki.lua + if [ -f xx01 ]; then + rm xx01 + fi +done popd diff --git a/SuckItPm b/SuckItPm index 156ee9f..a63eb08 100755 --- a/SuckItPm +++ b/SuckItPm @@ -10,20 +10,20 @@ filter=" -not -name ".pageindex" -a \ " -pushd /opt/merged +pushd /opt/mergedWork find /opt/pmwiki/wiki.d ${filter} \ -name "*.*" -type f,l -printf "%P\n" | while read line do base=`echo "${line}" | cut -d '.' -f 1` file=`echo "${line}" | cut -d '.' -f 2` +# page="?n=${line}" mkdir -p PmWiki/$base mkdir -p combined/$base echo "Converting ${URL}/?n=${base}.${file}?action=print -> PmWiki/${base}/${file}.md" # pandoc -f html -t markdown --self-contained ${URL}/?n=${base}.${file} >PmWiki/${base}/${file}.md # TODO - try curl, to see what is actually downloaded, and maybe not download unchanged pages. curl to .HTM # Doesn't help with redownloads, coz natch a dynamic site isn't cached. But I can at least comment out the curl command during testing to save time. 
-# curl --no-progress-meter ${URL}/?n=${base}.${file} -o PmWiki/${base}/${file}.HTM # curl --no-progress-meter ${URL}/?n=${base}.${file}?action=markdown -o PmWiki/${base}/${file}.MD curl --no-progress-meter ${URL}/?n=${base}.${file}?action=print -o PmWiki/${base}/${file}.HTM cp PmWiki/${base}/${file}.HTM PmWiki/${base}/${file}.HTM_ORIGINAL @@ -44,7 +44,6 @@ do -e "s/class='vspace'//g" \ -e "s/class='wikilink'//g" \ -e "s/style='.*;'//g" -# -e "s/class='.*'//g" \ # -e "s/style='background-color: #.*;'//g" \ # -e "s/style='font-size: .*;'//g" @@ -57,23 +56,26 @@ do -e 's/\$/\$dlr\$/g' \ -e 's/\{#.*\}//g' \ -e '/^:::/d' \ - -e '/\[Site$/d' \ - -e '/^Page last modified on /d' \ - -e '/^\[\]/d' \ - -e "s/\`\`\{=html\}\`<\/a>\`\{=html\}//g" \ - -e "s/^\`\`\{=html\}\`<\/a>\`\{=html\}//g" \ +# -e "s/^\`> PmWiki/${base}/${file}.md + # Don't need this, the parts we are grabbing already include that link at the bottom. +# echo -e "****\n[Original page](${URL}/${base}/${page}) where maybe you can edit it." >> PmWiki/${base}/${file}.md # pandoc -t html -f commonmark_x --self-contained PmWiki/${base}/${file}.md > PmWiki/${base}/${file}.htm # cmark-gfm -t html -e footnotes -e table -e strikethrough PmWiki/${base}/${file}.md > PmWiki/${base}/${file}.body # ln -frs PmWiki/${base}/${file}.body combined/${base}/${file}.body ln -frs PmWiki/${base}/${file}.md combined/${base}/${file}.md -done -notYetAnotherWiki.lua + if [ -f xx01 ]; then + rm xx01 + fi +done popd diff --git a/TODO.md b/TODO.md index 0a748d2..4157083 100644 --- a/TODO.md +++ b/TODO.md @@ -3,21 +3,21 @@ ## Do these Bugs - -- PmWiki in it's current config needs that ?n=foo.bar nonsense for the Original page link. - https://nyaw.wiki.devuan.org/Foswiki/Main/JensKorte/WebPreferences.HTML   "Main" trail not getting whichPage() -- https://nyaw.wiki.devuan.org/PmWiki/Site/Site.HTML   missing everything after the "?action=attr." bit. 
-- https://nyaw.wiki.devuan.org/PmWiki/Site/EditForm.HTML -- https://nyaw.wiki.devuan.org/PmWiki/Profiles/Debdog.HTML   pandoc can't handle the background table cell colours in the "Background colours" table, which is kinda the point of it. - https://nyaw.wiki.devuan.org/Foswiki/Sandbox/WebHome.HTML   has that twisty thing which looks not simple to remove. - https://nyaw.wiki.devuan.org/Foswiki/Sandbox/WebChanges.HTML - https://nyaw.wiki.devuan.org/Foswiki/Sandbox/WebLeftBarExample.HTML - https://nyaw.wiki.devuan.org/Foswiki/Sandbox/WebPreferences.HTML   has that twisty thing which looks not simple to remove. - https://nyaw.wiki.devuan.org/Foswiki/Sandbox/WebSearch.HTML   has that twisty thing which looks not simple to remove. - https://nyaw.wiki.devuan.org/Foswiki/Main/AdminGroup.HTML -- https://nyaw.wiki.devuan.org/Foswiki/Main/DevuanCluster.HTML   A very lengthy and complex document, I'll likely miss something, but chip away at the obvious. - https://nyaw.wiki.devuan.org/Foswiki/Main/JensKorte/WebLeftBar.HTML - https://nyaw.wiki.devuan.org/Foswiki/Main/WikiGroups.HTML   has that twisty thing which looks not simple to remove. +- https://nyaw.wiki.devuan.org/PmWiki/Site/Site.HTML   missing everything after the "?action=attr." bit. +- https://nyaw.wiki.devuan.org/PmWiki/Site/EditForm.HTML +- https://nyaw.wiki.devuan.org/users/dunno/DevuanCluster.HTML   A very lengthy and complex document, I'll likely miss something, but chip away at the obvious. +- https://nyaw.wiki.devuan.org/users/Debdog.HTML   pandoc can't handle the background table cell colours in the "Background colours" table, which is kinda the point of it. - {.underline} is the result of foo getting lost in translation. +- PmWiki in its current config needs that ?n=foo.bar nonsense for the Original page link. Which I'm currently neatly sidestepping, the scraped page has a similar thing. Check the timestamps on the files, only update if source is newer than destination. 
Meh, it's already 600 times faster than the pandoc version. - One quirk to watch for is if a URL path changes, the docs that have that URL need to be redone. -- cgit v1.1