diff options
| -rw-r--r-- | TODO.md | 9 | ||||
| -rwxr-xr-x | notYetAnotherWiki.lua | 174 |
2 files changed, 105 insertions, 78 deletions
| @@ -4,15 +4,6 @@ Make it perphekd! | |||
| 4 | 4 | ||
| 5 | ## Do these | 5 | ## Do these |
| 6 | 6 | ||
| 7 | Check the timestamps on the files, only update if source is newer than destination. Meh, it's already 600 times faster than the pandoc version. | ||
| 8 | |||
| 9 | - One quirk to watch for is if a URL path changes, the docs that have that URL need to be redone. | ||
| 10 | - pandoc is a lot slower though, so do this for sure when dealing with that. | ||
| 11 | - When scraping the web sites, they tend to be dynamically generated with no useful timestamp on them. | ||
| 12 | - The web site scrape happens locally anyway, I can compare source file timestamps. | ||
| 13 | - + So check timestamps when "downloading" the original, and before running pandoc on the result. Think that's the most time consuming steps. | ||
| 14 | - + Since this only stops updates of existing files, URLs changing are not a problem. | ||
| 15 | |||
| 16 | Add atom feed for single page. Alas cgit only seems to have ATOM feed on the whole repo, not individual files. | 7 | Add atom feed for single page. Alas cgit only seems to have ATOM feed on the whole repo, not individual files. |
| 17 | 8 | ||
| 18 | - However, once timestamps are sorted, I can use that code to generate (static?) RSS and ATOM feeds, and create page histories using diffs. | 9 | - However, once timestamps are sorted, I can use that code to generate (static?) RSS and ATOM feeds, and create page histories using diffs. |
diff --git a/notYetAnotherWiki.lua b/notYetAnotherWiki.lua index 53e3682..1cea01b 100755 --- a/notYetAnotherWiki.lua +++ b/notYetAnotherWiki.lua | |||
| @@ -253,27 +253,34 @@ end | |||
| 253 | 253 | ||
| 254 | -- Look for copied pages from the other wikis. | 254 | -- Look for copied pages from the other wikis. |
| 255 | for l in io.popen('find -L ' .. Directory .. ' -name "*.HTM" -type f,l -printf "%P\n"'):lines() do | 255 | for l in io.popen('find -L ' .. Directory .. ' -name "*.HTM" -type f,l -printf "%P\n"'):lines() do |
| 256 | -- print('pandoc converting ' .. l .. ' -> ' .. string.sub(l, 1, -4) .. 'md') | 256 | -- TODO - Only do this if .HTM is newer than .md, or .md doesn't exist. |
| 257 | -- Open the HTM files and do the initial cleanups, then pandoc them. | 257 | |
| 258 | h = io.open(l, 'r') | 258 | local htime = io.popen("date -ur " .. l .. " +%s"):read('l') |
| 259 | if nil ~= h then | 259 | local mtime = io.popen("date -ur " .. string.sub(l, 1, -4) .. "md +%s 2>/dev/null"):read('l') |
| 260 | local body = h:read('*a') ; h:close() | 260 | if (nil == mtime) or (htime > mtime) then |
| 261 | if 'Foswiki' == string.sub(l, 1, 7) then | 261 | print('pandoc converting ' .. l .. ' -> ' .. string.sub(l, 1, -4) .. 'md') |
| 262 | -- Strip out the actual content. | 262 | os.execute('cp ' .. l .. ' ' .. l .. '_ORIGINAL0') |
| 263 | local beg, en = RE.find(body, [['<div id="patternMainContents">']]) if nil ~= beg then body = string.sub(body, en + 1) end | 263 | -- Open the HTM files and do the initial cleanups, then pandoc them. |
| 264 | beg, en = RE.find(body, [['<div class="patternContent">']]) if nil ~= beg then body = string.sub(body, en + 1) end | 264 | h = io.open(l, 'r') |
| 265 | beg, en = RE.find(body, [['<div class="foswikiTopic">']]) if nil ~= beg then | 265 | if nil ~= h then |
| 266 | if ' -- ' == string.sub(body, en + 1, en + 4) then | 266 | local body = h:read('*a') ; h:close() |
| 267 | beg, en = RE.find(body, '[%nl]', en + 4) | 267 | writeString(l .. '_ORIGINAL1', body) |
| 268 | body = string.sub(body, en + 1) | 268 | if 'Foswiki' == string.sub(l, 1, 7) then |
| 269 | -- Strip out the actual content. | ||
| 270 | local beg, en = RE.find(body, [['<div id="patternMainContents">']]) if nil ~= beg then body = string.sub(body, en + 1) end | ||
| 271 | beg, en = RE.find(body, [['<div class="patternContent">']]) if nil ~= beg then body = string.sub(body, en + 1) end | ||
| 272 | beg, en = RE.find(body, [['<div class="foswikiTopic">']]) if nil ~= beg then | ||
| 273 | if ' -- ' == string.sub(body, en + 1, en + 4) then | ||
| 274 | beg, en = RE.find(body, '[%nl]', en + 4) | ||
| 275 | body = string.sub(body, en + 1) | ||
| 276 | end | ||
| 269 | end | 277 | end |
| 270 | end | 278 | beg, en = RE.find(body, [['<div class="patternInfo">']]) if nil ~= beg then body = string.sub(body, 1, beg - 1) end |
| 271 | beg, en = RE.find(body, [['<div class="patternInfo">']]) if nil ~= beg then body = string.sub(body, 1, beg - 1) end | 279 | -- beg, en = RE.find(body, [['<div class="foswikiForm foswikiFormStep">']]) if nil ~= beg then body = string.sub(body, 1, en + 1) end |
| 272 | -- beg, en = RE.find(body, [['<div class="foswikiForm foswikiFormStep">']]) if nil ~= beg then body = string.sub(body, 1, en + 1) end | 280 | beg, en = RE.find(body, [['<div class="foswikiAttachments foswikiFormStep" style="overflow:auto">']]) if nil ~= beg then body = string.sub(body, 1, beg - 1) end |
| 273 | beg, en = RE.find(body, [['<div class="foswikiAttachments foswikiFormStep" style="overflow:auto">']]) if nil ~= beg then body = string.sub(body, 1, beg - 1) end | 281 | beg, en = RE.find(body, [['<div class="foswikiSearchResultsPager">']]) if nil ~= beg then body = string.sub(body, 1, beg - 1) end |
| 274 | beg, en = RE.find(body, [['<div class="foswikiSearchResultsPager">']]) if nil ~= beg then body = string.sub(body, 1, beg - 1) end | 282 | -- Some clean ups. |
| 275 | -- Some clean ups. | 283 | local result = RE.compile( [[{~ |
| 276 | local result = RE.compile( [[{~ | ||
| 277 | ( | 284 | ( |
| 278 | {'class="foswikiCurrentTopicLink"'} -> blank / | 285 | {'class="foswikiCurrentTopicLink"'} -> blank / |
| 279 | {'class="foswikiNewLink"'} -> blank / | 286 | {'class="foswikiNewLink"'} -> blank / |
| @@ -287,41 +294,58 @@ for l in io.popen('find -L ' .. Directory .. ' -name "*.HTM" -type f,l -printf " | |||
| 287 | -- {'style="' ([^"])+ '"'} -> blank / {"style='" ([^'])+ "'"} -> blank / | 294 | -- {'style="' ([^"])+ '"'} -> blank / {"style='" ([^'])+ "'"} -> blank / |
| 288 | . | 295 | . |
| 289 | )* ~}]], { blank = function(a) return '' end } ):match(body) | 296 | )* ~}]], { blank = function(a) return '' end } ):match(body) |
| 290 | body = result | 297 | body = result |
| 291 | -- body = RE.gsub(body, [=[{"<!-- ".*"-->"}]=], '') -- FIXME | 298 | -- body = RE.gsub(body, [=[{"<!-- ".*"-->"}]=], '') -- FIXME |
| 292 | local here = 1 | 299 | local here = 1 |
| 293 | beg, en = RE.find(body, [['https://fos.wiki.devuan.org/']], here) | 300 | beg, en = RE.find(body, [['https://fos.wiki.devuan.org/']], here) |
| 294 | while nil ~= beg do | 301 | while nil ~= beg do |
| 295 | here = beg + 1 | 302 | here = beg + 1 |
| 296 | local beg0, en0 | 303 | local beg0, en0 |
| 297 | local url = nil | 304 | local url = nil |
| 298 | if '"' == string.sub(body, beg - 1, beg - 1) then | 305 | if '"' == string.sub(body, beg - 1, beg - 1) then |
| 299 | beg0, en0 = RE.find(body, [['"']], en) | 306 | beg0, en0 = RE.find(body, [['"']], en) |
| 300 | url = string.sub(body, en + 1, en0 - 1) | 307 | url = string.sub(body, en + 1, en0 - 1) |
| 301 | end | 308 | end |
| 302 | if "'" == string.sub(body, beg - 1, beg - 1) then | 309 | if "'" == string.sub(body, beg - 1, beg - 1) then |
| 303 | beg0, en0 = RE.find(body, [["'"]], en) | 310 | beg0, en0 = RE.find(body, [["'"]], en) |
| 304 | url = string.sub(body, en + 1, en0) | 311 | url = string.sub(body, en + 1, en0) |
| 305 | end | 312 | end |
| 306 | 313 | ||
| 307 | if nil ~= url then | 314 | if nil ~= url then |
| 308 | if ('pub/' == string.sub(url, 1, 4)) then | 315 | if ('pub/' == string.sub(url, 1, 4)) then |
| 309 | -- FIXME? - evil hack? | 316 | -- FIXME? - evil hack? |
| 310 | url = 'Foswiki/' .. url | 317 | url = 'Foswiki/' .. url |
| 311 | else | 318 | --print('FOSWIKI HTM ' .. url) |
| 312 | url = nil | 319 | else |
| 320 | url = nil | ||
| 321 | end | ||
| 313 | end | 322 | end |
| 323 | --print('HTM0 ' .. string.sub(body, beg, en + 84) .. ' \t\t') | ||
| 324 | beg, en, body, here = commonLinky(l, body, 'https://fos.wiki.devuan.org/', url, beg, en, beg0, en0, 1) | ||
| 325 | --if nil ~= en then print('HTM1 ' .. string.sub(body, beg, en + 84) .. ' \t\t') end | ||
| 326 | --[=[ | ||
| 327 | if nil == url then | ||
| 328 | print('OOPS! unknown linky - @' .. l .. '\t\t\t' .. string.sub(body, beg - 9, en) .. ' ' .. string.sub(body, en + 1, en0)) | ||
| 329 | else | ||
| 330 | -- print(' linky - @' .. l .. '\t\t\t' .. string.sub(body, beg - 9, en) .. ' ' .. string.sub(body, en + 1, en0) .. ' -> ' .. url) | ||
| 331 | local md = readMdMd(url, {}) | ||
| 332 | -- if nil ~= md then | ||
| 333 | if nil ~= md.realURL then url = md.realURL end | ||
| 334 | -- end | ||
| 335 | body = string.sub(body, 1, beg - 1) .. url .. string.sub(body, en0 + 1) | ||
| 336 | here = here + #url | ||
| 337 | end | ||
| 338 | beg, en = RE.find(body, [['https://fos.wiki.devuan.org/']], here) | ||
| 339 | ]=] | ||
| 314 | end | 340 | end |
| 315 | beg, en, body, here = commonLinky(l, body, 'https://fos.wiki.devuan.org/', url, beg, en, beg0, en0, 1) | ||
| 316 | end | ||
| 317 | 341 | ||
| 318 | writeString(l .. '_NEW', body) | 342 | writeString(l .. '_NEW', body) |
| 319 | elseif 'PmWiki' == string.sub(l, 1, 6) then | 343 | elseif 'PmWiki' == string.sub(l, 1, 6) then |
| 320 | local beg, en = RE.find(body, [['<!--PageText-->']]) if nil ~= beg then body = string.sub(body, en + 2) end | 344 | local beg, en = RE.find(body, [['<!--PageText-->']]) if nil ~= beg then body = string.sub(body, en + 2) end |
| 321 | beg, en = RE.find(body, [["div id='wikitext'>"]]) if nil ~= beg then body = string.sub(body, en + 2) end | 345 | beg, en = RE.find(body, [["div id='wikitext'>"]]) if nil ~= beg then body = string.sub(body, en + 2) end |
| 322 | beg, en = RE.find(body, [["<div id='printfoot'>"]]) if nil ~= beg then body = string.sub(body, 1, beg - (2 + 9)) end -- There's a </div> to get rid of to. | 346 | beg, en = RE.find(body, [["<div id='printfoot'>"]]) if nil ~= beg then body = string.sub(body, 1, beg - (2 + 9)) end -- There's a </div> to get rid of to. |
| 323 | beg, en = RE.find(body, [['<!--HTMLFooter-->']]) if nil ~= beg then body = string.sub(body, 1, beg - 2) end | 347 | beg, en = RE.find(body, [['<!--HTMLFooter-->']]) if nil ~= beg then body = string.sub(body, 1, beg - 2) end |
| 324 | local result = RE.compile( [[{~ | 348 | local result = RE.compile( [[{~ |
| 325 | ( | 349 | ( |
| 326 | {"class='categorylink'"} -> blank / | 350 | {"class='categorylink'"} -> blank / |
| 327 | {"class='createlink'"} -> blank / | 351 | {"class='createlink'"} -> blank / |
| @@ -341,31 +365,43 @@ for l in io.popen('find -L ' .. Directory .. ' -name "*.HTM" -type f,l -printf " | |||
| 341 | {"<span class='hlt " {([a-z])+} "'></span><pre" } -> "<pre class='%2'" / | 365 | {"<span class='hlt " {([a-z])+} "'></span><pre" } -> "<pre class='%2'" / |
| 342 | . | 366 | . |
| 343 | )* ~}]], { blank = function(a) return '' end } ):match(body) | 367 | )* ~}]], { blank = function(a) return '' end } ):match(body) |
| 344 | body = result | 368 | body = result |
| 345 | here = 1 | 369 | -- body = RE.gsub(body, [=["<a " {([^ >])+} " >"]=], "<a %1>") |
| 346 | beg, en = RE.find(body, [["'https://wiki.devuan.org/"]], here) | 370 | -- DONE? - <span class='hlt html'></span><pre style='background-color: #cc00ff;' class='escaped'> ... lines of HTML code ... </pre> |
| 347 | while nil ~= beg do | 371 | -- most of the time I'll see <pre class='escaped'> |
| 348 | here = beg + 1 | 372 | -- My own looking glass has several. |
| 349 | local beg0, en0 = RE.find(body, [["'"]], en) | 373 | -- Foswiki <pre class='bash'> |
| 374 | -- CommonMark->HTML ---lua <pre><code class="language-lua"> .............................. </code></pre> | ||
| 375 | -- Seems to be the spec way of doing it. | ||
| 376 | -- most of the time I'll see <pre><code> | ||
| 377 | |||
| 378 | here = 1 | ||
| 379 | beg, en = RE.find(body, [["'https://wiki.devuan.org/"]], here) | ||
| 380 | while nil ~= beg do | ||
| 381 | here = beg + 1 | ||
| 382 | local beg0, en0 = RE.find(body, [["'"]], en) | ||
| 350 | -- FIXME? - This might be working around a bug elsewhere. | 383 | -- FIXME? - This might be working around a bug elsewhere. |
| 351 | if "'" == string.sub(body, en0, en0) then en0 = en0 - 1 end | 384 | if "'" == string.sub(body, en0, en0) then en0 = en0 - 1 end |
| 352 | local url = string.sub(body, en + 1, en0) | 385 | local url = string.sub(body, en + 1, en0) |
| 353 | if '?n=' == string.sub(url, 1, 3) then | 386 | if '?n=' == string.sub(url, 1, 3) then |
| 354 | url = string.sub(url, 4):gsub('[%a]+%.([%a-]+)', '%1_pm.HTML') | 387 | url = string.sub(url, 4):gsub('[%a]+%.([%a-]+)', '%1_pm.HTML') |
| 355 | elseif ("'" == url) or ('uploads/' == string.sub(url, 1, 8)) then | 388 | elseif ("'" == url) or ('uploads/' == string.sub(url, 1, 8)) then |
| 356 | -- FIXME - evil hack? Yep, evil hack, need to know the depth of the source, which isn't here. | 389 | -- FIXME - evil hack? Yep, evil hack, need to know the depth of the source, which isn't here. |
| 357 | url = 'PmWiki/' .. url | 390 | url = 'PmWiki/' .. url |
| 358 | else | 391 | else |
| 359 | url = nil | 392 | url = nil |
| 393 | end | ||
| 394 | --print('HTM0 ' .. string.sub(body, beg, en + 84) .. ' \t\t') | ||
| 395 | beg, en, body, here = commonLinky(l, body, "'https://wiki.devuan.org/", url, beg, en, beg0, en0, 0) | ||
| 396 | --if nil ~= en then print('HTM1 ' .. string.sub(body, beg, en + 84) .. ' \t\t') end | ||
| 360 | end | 397 | end |
| 361 | beg, en, body, here = commonLinky(l, body, "'https://wiki.devuan.org/", url, beg, en, beg0, en0, 0) | ||
| 362 | end | ||
| 363 | 398 | ||
| 364 | writeString(l .. '_NEW', body) | 399 | writeString(l .. '_NEW', body) |
| 400 | end | ||
| 365 | end | 401 | end |
| 366 | end | ||
| 367 | 402 | ||
| 368 | ok, rslt, status = os.execute('pandoc --wrap=preserve -f html -t commonmark_x --self-contained ' .. l .. '_NEW' .. ' >' .. string.sub(l, 1, -4) .. 'md') | 403 | ok, rslt, status = os.execute('pandoc --wrap=preserve -f html -t commonmark_x --self-contained ' .. l .. '_NEW' .. ' >' .. string.sub(l, 1, -4) .. 'md') |
| 404 | end | ||
| 369 | end | 405 | end |
| 370 | 406 | ||
| 371 | if '.' ~= Directory then | 407 | if '.' ~= Directory then |
