#!/bin/bash
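# Fetch each page of the PmWiki instance at wiki.devuan.org, convert it to
# CommonMark with pandoc, and link the result into the combined/ tree under
# /opt/mergedWork.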
URL="https://wiki.devuan.org"
filter="
-not -name "*~" -a \
-not -name ".flock" -a \
-not -name ".htaccess" -a \
-not -name ".lastmod" -a \
-not -name ".pageindex" -a \
"
pushd /opt/mergedWork || exit 1
find /opt/pmwiki/wiki.d "${filter[@]}" \
-name "*.*" -type f,l -printf "%P\n" | while IFS= read -r line
do
# PmWiki names pages Group.Name; split off the group and the page name.
base=$(echo "${line}" | cut -d '.' -f 1)
file=$(echo "${line}" | cut -d '.' -f 2)
# page="?n=${line}"
mkdir -p "PmWiki/${base}" "combined/${base}"
echo "Converting ${URL}/?n=${base}.${file}&action=print -> PmWiki/${base}/${file}.md"
# pandoc -f html -t markdown --self-contained ${URL}/?n=${base}.${file} >PmWiki/${base}/${file}.md
# TODO - try curl, to see what is actually downloaded, and maybe not download unchanged pages. curl to .HTM
# Doesn't help with redownloads, coz natch a dynamic site isn't cached. But I can at least comment out the curl command during testing to save time.
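# One possible shape for that (untested sketch; SKIP_FETCH is a made-up
# switch, nothing else in this script sets it):
# if [ -n "${SKIP_FETCH}" ] && [ -f "PmWiki/${base}/${file}.HTM_ORIGINAL" ]; then
#     cp "PmWiki/${base}/${file}.HTM_ORIGINAL" "PmWiki/${base}/${file}.HTM"
# else
#     curl --no-progress-meter "${URL}/?n=${base}.${file}&action=print" -o "PmWiki/${base}/${file}.HTM"
# fi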
# curl --no-progress-meter "${URL}/?n=${base}.${file}&action=markdown" -o "PmWiki/${base}/${file}.MD"
curl --no-progress-meter "${URL}/?n=${base}.${file}&action=print" -o "PmWiki/${base}/${file}.HTM"
cp "PmWiki/${base}/${file}.HTM" "PmWiki/${base}/${file}.HTM_ORIGINAL"
# Cut out just the page body: csplit discards everything before
# <!--PageText-->, writes the body (up to <!--HTMLFooter-->) as xx00,
# and the footer as xx01.
csplit -ks "PmWiki/${base}/${file}.HTM" '%<!--PageText-->%' '/<!--HTMLFooter-->/'
if [ -f xx00 ]; then
# Replace the full page with just the extracted body.
mv -f xx00 "PmWiki/${base}/${file}.HTM"
fi
# Strip PmWiki-specific link/class attributes and inline styles before the
# pandoc conversion. Note [^']* rather than the greedy .* so that two styled
# elements on one line don't have everything between them deleted.
sed -i -E \
-e "s/rel='nofollow'//g" \
-e "s/target='_blank'//g" \
-e "s/class='createlink'//g" \
-e "s/class='createlinktext'//g" \
-e "s/class='escaped'//g" \
-e "s/class='diffmarkup'//g" \
-e "s/class='selflink'//g" \
-e "s/class='urllink'//g" \
-e "s/class='vspace'//g" \
-e "s/class='wikilink'//g" \
-e "s/style='[^']*'//g" \
"PmWiki/${base}/${file}.HTM"
# -e "s/style='background-color: #.*;'//g" \
# -e "s/style='font-size: .*;'//g"
pandoc -f html -t commonmark_x --self-contained "PmWiki/${base}/${file}.HTM" >"PmWiki/${base}/${file}.md"
# pandoc -f markdown -t commonmark_x --self-contained "PmWiki/${base}/${file}.MD" >"PmWiki/${base}/${file}.md"
cp "PmWiki/${base}/${file}.md" "PmWiki/${base}/${file}.md_ORIGINAL"
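# The *_ORIGINAL copies exist so the sed cleanup below can be checked by eye,
# e.g. (just a suggestion, not part of the pipeline):
# diff -u "PmWiki/${base}/${file}.md_ORIGINAL" "PmWiki/${base}/${file}.md"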
# Attempt to clean things up, badly: escape literal dollar signs as $dlr$
# (apparently for a later processing step), drop pandoc's {#anchor}
# attributes, and delete the ::: fenced-div marker lines.
sed -i -E \
-e 's/\$/\$dlr\$/g' \
-e 's/\{#.*\}//g' \
-e '/^:::/d' \
"PmWiki/${base}/${file}.md"
# Earlier cleanup attempts, kept for reference:
# -e '/\[Site$/d' \
# -e '/^Page last modified on /d' \
# -e '/^\[\]/d' \
# -e "s/\`<a id='trailstart'>\`\{=html\}\`<\/a>\`\{=html\}//g" \
# -e "s/^\`<img /<img /g" \
# -e "s/^\`\`\`\{=html\}//g" \
# -e "s/^\`\`\`//g" \
# -e "s/\`\{=html\}//g"
# Don't need this: the parts we are grabbing already include that link at the bottom.
# echo -e "****\n[Original page](${URL}/${base}/${page}) where maybe you can edit it." >> PmWiki/${base}/${file}.md
# pandoc -t html -f commonmark_x --self-contained PmWiki/${base}/${file}.md > PmWiki/${base}/${file}.htm
# cmark-gfm -t html -e footnotes -e table -e strikethrough PmWiki/${base}/${file}.md > PmWiki/${base}/${file}.body
# ln -frs PmWiki/${base}/${file}.body combined/${base}/${file}.body
ln -frs "PmWiki/${base}/${file}.md" "combined/${base}/${file}.md"
# Discard the footer section (from <!--HTMLFooter--> on) that csplit wrote.
rm -f xx01
done
popd