diff options
-rw-r--r-- | TODO.md | 9 | ||||
-rwxr-xr-x | notYetAnotherWiki.lua | 174 |
2 files changed, 105 insertions, 78 deletions
@@ -4,15 +4,6 @@ Make it perphekd! | |||
4 | 4 | ||
5 | ## Do these | 5 | ## Do these |
6 | 6 | ||
7 | Check the timestamps on the files, only update if source is newer than destination. Meh, it's already 600 times faster than the pandoc version. | ||
8 | |||
9 | - One quirk to watch for is if a URL path changes, the docs that have that URL need to be redone. | ||
10 | - pandoc is a lot slower though, so do this for sure when dealing with that. | ||
11 | - When scraping the web sites, they tend to be dynamically generated with no useful timestamp on them. | ||
12 | - The web site scrape happens locally anyway, I can compare source file timestamps. | ||
13 | - + So check timestamps when "downloading" the original, and before running pandoc on the result. Think that's the most time consuming steps. | ||
14 | - + Since this only stops updates of existing files, URLs changing are not a problem. | ||
15 | |||
16 | Add atom feed for single page. Alas cgit only seems to have ATOM feed on the whole repo, not individual files. | 7 | Add atom feed for single page. Alas cgit only seems to have ATOM feed on the whole repo, not individual files. |
17 | 8 | ||
18 | - However, once timestamps are sorted, I can use that code to generate (static?) RSS and ATOM feeds, and create page histories using diffs. | 9 | - However, once timestamps are sorted, I can use that code to generate (static?) RSS and ATOM feeds, and create page histories using diffs. |
diff --git a/notYetAnotherWiki.lua b/notYetAnotherWiki.lua index 53e3682..1cea01b 100755 --- a/notYetAnotherWiki.lua +++ b/notYetAnotherWiki.lua | |||
@@ -253,27 +253,34 @@ end | |||
253 | 253 | ||
254 | -- Look for copied pages from the other wikis. | 254 | -- Look for copied pages from the other wikis. |
255 | for l in io.popen('find -L ' .. Directory .. ' -name "*.HTM" -type f,l -printf "%P\n"'):lines() do | 255 | for l in io.popen('find -L ' .. Directory .. ' -name "*.HTM" -type f,l -printf "%P\n"'):lines() do |
256 | -- print('pandoc converting ' .. l .. ' -> ' .. string.sub(l, 1, -4) .. 'md') | 256 | -- TODO - Only do this if .HTM is newer than .md, or .md doesn't exist. |
257 | -- Open the HTM files and do the initial cleanups, then pandoc them. | 257 | |
258 | h = io.open(l, 'r') | 258 | local htime = io.popen("date -ur " .. l .. " +%s"):read('l') |
259 | if nil ~= h then | 259 | local mtime = io.popen("date -ur " .. string.sub(l, 1, -4) .. "md +%s 2>/dev/null"):read('l') |
260 | local body = h:read('*a') ; h:close() | 260 | if (nil == mtime) or (htime > mtime) then |
261 | if 'Foswiki' == string.sub(l, 1, 7) then | 261 | print('pandoc converting ' .. l .. ' -> ' .. string.sub(l, 1, -4) .. 'md') |
262 | -- Strip out the actual content. | 262 | os.execute('cp ' .. l .. ' ' .. l .. '_ORIGINAL0') |
263 | local beg, en = RE.find(body, [['<div id="patternMainContents">']]) if nil ~= beg then body = string.sub(body, en + 1) end | 263 | -- Open the HTM files and do the initial cleanups, then pandoc them. |
264 | beg, en = RE.find(body, [['<div class="patternContent">']]) if nil ~= beg then body = string.sub(body, en + 1) end | 264 | h = io.open(l, 'r') |
265 | beg, en = RE.find(body, [['<div class="foswikiTopic">']]) if nil ~= beg then | 265 | if nil ~= h then |
266 | if ' -- ' == string.sub(body, en + 1, en + 4) then | 266 | local body = h:read('*a') ; h:close() |
267 | beg, en = RE.find(body, '[%nl]', en + 4) | 267 | writeString(l .. '_ORIGINAL1', body) |
268 | body = string.sub(body, en + 1) | 268 | if 'Foswiki' == string.sub(l, 1, 7) then |
269 | -- Strip out the actual content. | ||
270 | local beg, en = RE.find(body, [['<div id="patternMainContents">']]) if nil ~= beg then body = string.sub(body, en + 1) end | ||
271 | beg, en = RE.find(body, [['<div class="patternContent">']]) if nil ~= beg then body = string.sub(body, en + 1) end | ||
272 | beg, en = RE.find(body, [['<div class="foswikiTopic">']]) if nil ~= beg then | ||
273 | if ' -- ' == string.sub(body, en + 1, en + 4) then | ||
274 | beg, en = RE.find(body, '[%nl]', en + 4) | ||
275 | body = string.sub(body, en + 1) | ||
276 | end | ||
269 | end | 277 | end |
270 | end | 278 | beg, en = RE.find(body, [['<div class="patternInfo">']]) if nil ~= beg then body = string.sub(body, 1, beg - 1) end |
271 | beg, en = RE.find(body, [['<div class="patternInfo">']]) if nil ~= beg then body = string.sub(body, 1, beg - 1) end | 279 | -- beg, en = RE.find(body, [['<div class="foswikiForm foswikiFormStep">']]) if nil ~= beg then body = string.sub(body, 1, en + 1) end |
272 | -- beg, en = RE.find(body, [['<div class="foswikiForm foswikiFormStep">']]) if nil ~= beg then body = string.sub(body, 1, en + 1) end | 280 | beg, en = RE.find(body, [['<div class="foswikiAttachments foswikiFormStep" style="overflow:auto">']]) if nil ~= beg then body = string.sub(body, 1, beg - 1) end |
273 | beg, en = RE.find(body, [['<div class="foswikiAttachments foswikiFormStep" style="overflow:auto">']]) if nil ~= beg then body = string.sub(body, 1, beg - 1) end | 281 | beg, en = RE.find(body, [['<div class="foswikiSearchResultsPager">']]) if nil ~= beg then body = string.sub(body, 1, beg - 1) end |
274 | beg, en = RE.find(body, [['<div class="foswikiSearchResultsPager">']]) if nil ~= beg then body = string.sub(body, 1, beg - 1) end | 282 | -- Some clean ups. |
275 | -- Some clean ups. | 283 | local result = RE.compile( [[{~ |
276 | local result = RE.compile( [[{~ | ||
277 | ( | 284 | ( |
278 | {'class="foswikiCurrentTopicLink"'} -> blank / | 285 | {'class="foswikiCurrentTopicLink"'} -> blank / |
279 | {'class="foswikiNewLink"'} -> blank / | 286 | {'class="foswikiNewLink"'} -> blank / |
@@ -287,41 +294,58 @@ for l in io.popen('find -L ' .. Directory .. ' -name "*.HTM" -type f,l -printf " | |||
287 | -- {'style="' ([^"])+ '"'} -> blank / {"style='" ([^'])+ "'"} -> blank / | 294 | -- {'style="' ([^"])+ '"'} -> blank / {"style='" ([^'])+ "'"} -> blank / |
288 | . | 295 | . |
289 | )* ~}]], { blank = function(a) return '' end } ):match(body) | 296 | )* ~}]], { blank = function(a) return '' end } ):match(body) |
290 | body = result | 297 | body = result |
291 | -- body = RE.gsub(body, [=[{"<!-- ".*"-->"}]=], '') -- FIXME | 298 | -- body = RE.gsub(body, [=[{"<!-- ".*"-->"}]=], '') -- FIXME |
292 | local here = 1 | 299 | local here = 1 |
293 | beg, en = RE.find(body, [['https://fos.wiki.devuan.org/']], here) | 300 | beg, en = RE.find(body, [['https://fos.wiki.devuan.org/']], here) |
294 | while nil ~= beg do | 301 | while nil ~= beg do |
295 | here = beg + 1 | 302 | here = beg + 1 |
296 | local beg0, en0 | 303 | local beg0, en0 |
297 | local url = nil | 304 | local url = nil |
298 | if '"' == string.sub(body, beg - 1, beg - 1) then | 305 | if '"' == string.sub(body, beg - 1, beg - 1) then |
299 | beg0, en0 = RE.find(body, [['"']], en) | 306 | beg0, en0 = RE.find(body, [['"']], en) |
300 | url = string.sub(body, en + 1, en0 - 1) | 307 | url = string.sub(body, en + 1, en0 - 1) |
301 | end | 308 | end |
302 | if "'" == string.sub(body, beg - 1, beg - 1) then | 309 | if "'" == string.sub(body, beg - 1, beg - 1) then |
303 | beg0, en0 = RE.find(body, [["'"]], en) | 310 | beg0, en0 = RE.find(body, [["'"]], en) |
304 | url = string.sub(body, en + 1, en0) | 311 | url = string.sub(body, en + 1, en0) |
305 | end | 312 | end |
306 | 313 | ||
307 | if nil ~= url then | 314 | if nil ~= url then |
308 | if ('pub/' == string.sub(url, 1, 4)) then | 315 | if ('pub/' == string.sub(url, 1, 4)) then |
309 | -- FIXME? - evil hack? | 316 | -- FIXME? - evil hack? |
310 | url = 'Foswiki/' .. url | 317 | url = 'Foswiki/' .. url |
311 | else | 318 | --print('FOSWIKI HTM ' .. url) |
312 | url = nil | 319 | else |
320 | url = nil | ||
321 | end | ||
313 | end | 322 | end |
323 | --print('HTM0 ' .. string.sub(body, beg, en + 84) .. ' \t\t') | ||
324 | beg, en, body, here = commonLinky(l, body, 'https://fos.wiki.devuan.org/', url, beg, en, beg0, en0, 1) | ||
325 | --if nil ~= en then print('HTM1 ' .. string.sub(body, beg, en + 84) .. ' \t\t') end | ||
326 | --[=[ | ||
327 | if nil == url then | ||
328 | print('OOPS! unknown linky - @' .. l .. '\t\t\t' .. string.sub(body, beg - 9, en) .. ' ' .. string.sub(body, en + 1, en0)) | ||
329 | else | ||
330 | -- print(' linky - @' .. l .. '\t\t\t' .. string.sub(body, beg - 9, en) .. ' ' .. string.sub(body, en + 1, en0) .. ' -> ' .. url) | ||
331 | local md = readMdMd(url, {}) | ||
332 | -- if nil ~= md then | ||
333 | if nil ~= md.realURL then url = md.realURL end | ||
334 | -- end | ||
335 | body = string.sub(body, 1, beg - 1) .. url .. string.sub(body, en0 + 1) | ||
336 | here = here + #url | ||
337 | end | ||
338 | beg, en = RE.find(body, [['https://fos.wiki.devuan.org/']], here) | ||
339 | ]=] | ||
314 | end | 340 | end |
315 | beg, en, body, here = commonLinky(l, body, 'https://fos.wiki.devuan.org/', url, beg, en, beg0, en0, 1) | ||
316 | end | ||
317 | 341 | ||
318 | writeString(l .. '_NEW', body) | 342 | writeString(l .. '_NEW', body) |
319 | elseif 'PmWiki' == string.sub(l, 1, 6) then | 343 | elseif 'PmWiki' == string.sub(l, 1, 6) then |
320 | local beg, en = RE.find(body, [['<!--PageText-->']]) if nil ~= beg then body = string.sub(body, en + 2) end | 344 | local beg, en = RE.find(body, [['<!--PageText-->']]) if nil ~= beg then body = string.sub(body, en + 2) end |
321 | beg, en = RE.find(body, [["div id='wikitext'>"]]) if nil ~= beg then body = string.sub(body, en + 2) end | 345 | beg, en = RE.find(body, [["div id='wikitext'>"]]) if nil ~= beg then body = string.sub(body, en + 2) end |
322 | beg, en = RE.find(body, [["<div id='printfoot'>"]]) if nil ~= beg then body = string.sub(body, 1, beg - (2 + 9)) end -- There's a </div> to get rid of to. | 346 | beg, en = RE.find(body, [["<div id='printfoot'>"]]) if nil ~= beg then body = string.sub(body, 1, beg - (2 + 9)) end -- There's a </div> to get rid of to. |
323 | beg, en = RE.find(body, [['<!--HTMLFooter-->']]) if nil ~= beg then body = string.sub(body, 1, beg - 2) end | 347 | beg, en = RE.find(body, [['<!--HTMLFooter-->']]) if nil ~= beg then body = string.sub(body, 1, beg - 2) end |
324 | local result = RE.compile( [[{~ | 348 | local result = RE.compile( [[{~ |
325 | ( | 349 | ( |
326 | {"class='categorylink'"} -> blank / | 350 | {"class='categorylink'"} -> blank / |
327 | {"class='createlink'"} -> blank / | 351 | {"class='createlink'"} -> blank / |
@@ -341,31 +365,43 @@ for l in io.popen('find -L ' .. Directory .. ' -name "*.HTM" -type f,l -printf " | |||
341 | {"<span class='hlt " {([a-z])+} "'></span><pre" } -> "<pre class='%2'" / | 365 | {"<span class='hlt " {([a-z])+} "'></span><pre" } -> "<pre class='%2'" / |
342 | . | 366 | . |
343 | )* ~}]], { blank = function(a) return '' end } ):match(body) | 367 | )* ~}]], { blank = function(a) return '' end } ):match(body) |
344 | body = result | 368 | body = result |
345 | here = 1 | 369 | -- body = RE.gsub(body, [=["<a " {([^ >])+} " >"]=], "<a %1>") |
346 | beg, en = RE.find(body, [["'https://wiki.devuan.org/"]], here) | 370 | -- DONE? - <span class='hlt html'></span><pre style='background-color: #cc00ff;' class='escaped'> ... lines of HTML code ... </pre> |
347 | while nil ~= beg do | 371 | -- most of the time I'll see <pre class='escaped'> |
348 | here = beg + 1 | 372 | -- My own looking glass has several. |
349 | local beg0, en0 = RE.find(body, [["'"]], en) | 373 | -- Foswiki <pre class='bash'> |
374 | -- CommonMark->HTML ---lua <pre><code class="language-lua"> .............................. </code></pre> | ||
375 | -- Seems to be the spec way of doing it. | ||
376 | -- most of the time I'll see <pre><code> | ||
377 | |||
378 | here = 1 | ||
379 | beg, en = RE.find(body, [["'https://wiki.devuan.org/"]], here) | ||
380 | while nil ~= beg do | ||
381 | here = beg + 1 | ||
382 | local beg0, en0 = RE.find(body, [["'"]], en) | ||
350 | -- FIXME? - This might be working around a bug elsewhere. | 383 | -- FIXME? - This might be working around a bug elsewhere. |
351 | if "'" == string.sub(body, en0, en0) then en0 = en0 - 1 end | 384 | if "'" == string.sub(body, en0, en0) then en0 = en0 - 1 end |
352 | local url = string.sub(body, en + 1, en0) | 385 | local url = string.sub(body, en + 1, en0) |
353 | if '?n=' == string.sub(url, 1, 3) then | 386 | if '?n=' == string.sub(url, 1, 3) then |
354 | url = string.sub(url, 4):gsub('[%a]+%.([%a-]+)', '%1_pm.HTML') | 387 | url = string.sub(url, 4):gsub('[%a]+%.([%a-]+)', '%1_pm.HTML') |
355 | elseif ("'" == url) or ('uploads/' == string.sub(url, 1, 8)) then | 388 | elseif ("'" == url) or ('uploads/' == string.sub(url, 1, 8)) then |
356 | -- FIXME - evil hack? Yep, evil hack, need to know the depth of the source, which isn't here. | 389 | -- FIXME - evil hack? Yep, evil hack, need to know the depth of the source, which isn't here. |
357 | url = 'PmWiki/' .. url | 390 | url = 'PmWiki/' .. url |
358 | else | 391 | else |
359 | url = nil | 392 | url = nil |
393 | end | ||
394 | --print('HTM0 ' .. string.sub(body, beg, en + 84) .. ' \t\t') | ||
395 | beg, en, body, here = commonLinky(l, body, "'https://wiki.devuan.org/", url, beg, en, beg0, en0, 0) | ||
396 | --if nil ~= en then print('HTM1 ' .. string.sub(body, beg, en + 84) .. ' \t\t') end | ||
360 | end | 397 | end |
361 | beg, en, body, here = commonLinky(l, body, "'https://wiki.devuan.org/", url, beg, en, beg0, en0, 0) | ||
362 | end | ||
363 | 398 | ||
364 | writeString(l .. '_NEW', body) | 399 | writeString(l .. '_NEW', body) |
400 | end | ||
365 | end | 401 | end |
366 | end | ||
367 | 402 | ||
368 | ok, rslt, status = os.execute('pandoc --wrap=preserve -f html -t commonmark_x --self-contained ' .. l .. '_NEW' .. ' >' .. string.sub(l, 1, -4) .. 'md') | 403 | ok, rslt, status = os.execute('pandoc --wrap=preserve -f html -t commonmark_x --self-contained ' .. l .. '_NEW' .. ' >' .. string.sub(l, 1, -4) .. 'md') |
404 | end | ||
369 | end | 405 | end |
370 | 406 | ||
371 | if '.' ~= Directory then | 407 | if '.' ~= Directory then |