From 747d6044dfcad03f2899f8b68cd37925a63ebbf7 Mon Sep 17 00:00:00 2001 From: Erich Eckner Date: Wed, 9 Jan 2019 16:39:20 +0100 Subject: Initial commit --- .gitignore | 4 ++ dive-into-wikipedia.sh | 22 +++++++++ markov.lpi | 69 ++++++++++++++++++++++++++++ markov.lpr | 122 +++++++++++++++++++++++++++++++++++++++++++++++++ markov.lps | 94 +++++++++++++++++++++++++++++++++++++ remove-tags.sh | 15 ++++++ 6 files changed, 326 insertions(+) create mode 100644 .gitignore create mode 100755 dive-into-wikipedia.sh create mode 100644 markov.lpi create mode 100644 markov.lpr create mode 100644 markov.lps create mode 100755 remove-tags.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2a8f638 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +markov +backup +lib + diff --git a/dive-into-wikipedia.sh b/dive-into-wikipedia.sh new file mode 100755 index 0000000..8088e95 --- /dev/null +++ b/dive-into-wikipedia.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +if [ $# -eq 0 ]; then + "$0" 'https://de.wikipedia.org/wiki/Evolution' +elif [ $# -eq 1 ]; then + printf '%s\n' "$1" + curl -x 'socks5://127.0.0.1:9050' -s "$1" \ + | sed ' + s/>/>\n/g + s/].*,https://de.wikipedia.org/wiki/\2, + t + d + ' \ + | grep -vxF '' +else + parallel -n1 -j0 "$0" +fi \ + | sort -u + diff --git a/markov.lpi b/markov.lpi new file mode 100644 index 0000000..22c2077 --- /dev/null +++ b/markov.lpi @@ -0,0 +1,69 @@ + + + + + + + + + + + + + + <UseAppBundle Value="False"/> + <ResourceType Value="res"/> + </General> + <VersionInfo> + <StringTable ProductVersion=""/> + </VersionInfo> + <BuildModes Count="1"> + <Item1 Name="Default" Default="True"/> + </BuildModes> + <PublishOptions> + <Version Value="2"/> + </PublishOptions> + <RunParams> + <local> + <FormatVersion Value="1"/> + </local> + </RunParams> + <Units Count="3"> + <Unit0> + <Filename Value="markov.lpr"/> + <IsPartOfProject Value="True"/> + </Unit0> + <Unit1> + <Filename Value="remove-tags.sh"/> + <IsPartOfProject Value="True"/> + </Unit1> + 
<Unit2>
+        <Filename Value="dive-into-wikipedia.sh"/>
+        <IsPartOfProject Value="True"/>
+      </Unit2>
+    </Units>
+  </ProjectOptions>
+  <CompilerOptions>
+    <Version Value="11"/>
+    <Target>
+      <Filename Value="markov"/>
+    </Target>
+    <SearchPaths>
+      <IncludeFiles Value="$(ProjOutDir)"/>
+      <UnitOutputDirectory Value="lib/$(TargetCPU)-$(TargetOS)"/>
+    </SearchPaths>
+  </CompilerOptions>
+  <Debugging>
+    <Exceptions Count="3">
+      <Item1>
+        <Name Value="EAbort"/>
+      </Item1>
+      <Item2>
+        <Name Value="ECodetoolError"/>
+      </Item2>
+      <Item3>
+        <Name Value="EFOpenError"/>
+      </Item3>
+    </Exceptions>
+  </Debugging>
+</CONFIG>
diff --git a/markov.lpr b/markov.lpr
new file mode 100644
index 0000000..a9e7507
--- /dev/null
+++ b/markov.lpr
@@ -0,0 +1,175 @@
+program markov;
+
+{
+  Character-level Markov chain word generator.
+
+  usage: markov word-file chain-depth
+
+  Every line of word-file is cut into words at each byte that is not in
+  the alphabet i2c.  For each context (the previous chain-depth-1 symbols)
+  the program counts which symbol follows, where symbol 0 stands for
+  "end of word".  The counts are normalised to probabilities per context
+  and one random word is drawn from them and written to stdout.
+
+  Table layout: a symbol sequence is read as a number in base
+  length(i2c)+1 with the most recent symbol as lowest digit.  Hence the
+  entries of one context form a contiguous block of `base` elements, and
+  shifting by one digit modulo the table size drops the oldest symbol.
+}
+
+{$mode objfpc}{$H+}
+
+uses
+  {$IFDEF UNIX}{$IFDEF UseCThreads}
+  cthreads,
+  {$ENDIF}{$ENDIF}
+  Classes
+  { you can add units after this },
+  sysutils, math;
+
+var
+  wordFileName,s: string;
+  depth,i,j,current: int64;
+  base,tableSize,symbol: int64;
+  inWord: boolean;
+  c: char;
+  f: textFile;
+  probabilities: array of extended;
+  total: extended;
+  c2i: array[char] of longint;
+  i2c: string;
+
+begin
+  randomize;
+  if paramCount<>2 then begin
+    writeln(stderr,'usage: markov word-file chain-depth');
+    halt(1);
+  end;
+  wordFileName:=paramStr(1);
+  if not fileExists(wordFileName) then begin
+    writeln(stderr,'file '''+wordFileName+''' does not exist');
+    halt(1);
+  end;
+  try
+    depth:=strToInt(paramstr(2));
+  except
+    writeln(stderr,''''+paramStr(2)+''' is not a valid integer');
+    halt(1);
+  end;
+  if depth<=0 then begin
+    writeln(stderr,intToStr(depth)+' is not positive');
+    halt(1);
+  end;
+
+  // Alphabet. Text is processed byte-wise (c2i is indexed by char), so a
+  // byte that occurs twice in the literal is dropped to keep the mapping
+  // unique - presumably the shared UTF-8 lead bytes of the umlauts.
+  i2c:='abcdefghijklmnopqrstuvwxyzäöüßABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜ';
+  for i:=length(i2c)-1 downto 1 do
+    for j:=length(i2c) downto i+1 do
+      if i2c[i]=i2c[j] then
+        delete(i2c,j,1);
+  for c:=#0 to #255 do
+    c2i[c]:=-1;
+  for i:=1 to length(i2c) do
+    c2i[i2c[i]]:=i;
+  base:=length(i2c)+1; // +1: symbol 0 is the end-of-word marker
+
+  // tableSize = base^depth in exact integer arithmetic. The bound keeps
+  // current*base (used below) inside the int64 range.
+  tableSize:=1;
+  for i:=1 to depth do begin
+    if tableSize>high(int64) div base div base then begin
+      writeln(stderr,intToStr(depth)+' is too large');
+      halt(1);
+    end;
+    tableSize:=tableSize*base;
+  end;
+  setLength(probabilities,tableSize);
+  for i:=0 to tableSize-1 do
+    probabilities[i]:=0;
+
+  // training: count every (context, next symbol) occurrence
+  assignFile(f,wordFileName);
+  reset(f);
+  try
+    while not eof(f) do begin
+      readln(f,s);
+      current:=0; // reset current index
+      inWord:=false;
+      for i:=1 to length(s) do begin
+        current:=current*base mod tableSize;
+        symbol:=c2i[s[i]];
+        if symbol<0 then begin
+          if inWord then // terminate word
+            probabilities[current]:=probabilities[current]+1;
+          current:=0;
+          inWord:=false;
+          continue;
+        end;
+        current:=current + symbol;
+        probabilities[current]:=probabilities[current]+1;
+        inWord:=true;
+      end;
+      // a word reaching the end of the line ends there, too
+      if inWord then begin
+        current:=current*base mod tableSize;
+        probabilities[current]:=probabilities[current]+1;
+      end;
+    end;
+  finally
+    closeFile(f);
+  end;
+
+  // normalise each context block so that its entries sum up to 1
+  for i:=0 to tableSize div base-1 do begin
+    total:=0;
+    for j:=0 to base-1 do
+      total:=total + probabilities[j + i*base];
+    if total=0 then
+      continue;
+    total:=1/total;
+    for j:=0 to base-1 do
+      probabilities[j + i*base]:=probabilities[j + i*base] * total;
+  end;
+
+(*
+  // debug output
+  for current:=0 to length(probabilities)-1 do begin
+    j:=current;
+    for i:=1 to depth do begin
+      if (j mod (length(i2c)+1)) = 0 then
+        write('_')
+      else
+        write(i2c[j mod (length(i2c)+1)]);
+      j:=j div (length(i2c)+1);
+    end;
+    writeln(' ',probabilities[current]);
+  end; *)
+
+  // generation
+  current:=0;
+  repeat
+    current:=current*base mod tableSize; // start of the context block
+    total:=random;
+    // pick the first symbol whose cumulative probability exceeds total;
+    // entries with probability 0 are never picked, rounding leftovers
+    // fall onto the last possible symbol
+    symbol:=-1;
+    for j:=0 to base-1 do
+      if probabilities[current+j]>0 then begin
+        symbol:=j;
+        total:=total - probabilities[current+j];
+        if total<0 then
+          break;
+      end;
+    if symbol<=0 then // end of word (0) or nothing learned (-1)
+      break;
+    current:=current + symbol;
+    write(i2c[symbol]);
+  until false;
+  writeln;
+
+  setLength(probabilities,0);
+end.
+ diff --git a/markov.lps b/markov.lps new file mode 100644 index 0000000..3cbf184 --- /dev/null +++ b/markov.lps @@ -0,0 +1,94 @@ +<?xml version="1.0" encoding="UTF-8"?> +<CONFIG> + <ProjectSession> + <Version Value="10"/> + <BuildModes Active="Default"/> + <Units Count="3"> + <Unit0> + <Filename Value="markov.lpr"/> + <IsPartOfProject Value="True"/> + <TopLine Value="6"/> + <CursorPos X="13" Y="24"/> + <UsageCount Value="20"/> + <Loaded Value="True"/> + </Unit0> + <Unit1> + <Filename Value="remove-tags.sh"/> + <IsPartOfProject Value="True"/> + <IsVisibleTab Value="True"/> + <EditorIndex Value="1"/> + <CursorPos X="42" Y="15"/> + <UsageCount Value="20"/> + <Loaded Value="True"/> + <DefaultSyntaxHighlighter Value="Bash"/> + </Unit1> + <Unit2> + <Filename Value="dive-into-wikipedia.sh"/> + <IsPartOfProject Value="True"/> + <EditorIndex Value="2"/> + <CursorPos X="37" Y="16"/> + <UsageCount Value="20"/> + <Loaded Value="True"/> + <DefaultSyntaxHighlighter Value="Bash"/> + </Unit2> + </Units> + <JumpHistory Count="14" HistoryIndex="13"> + <Position1> + <Filename Value="markov.lpr"/> + <Caret Line="19" Column="12"/> + </Position1> + <Position2> + <Filename Value="markov.lpr"/> + <Caret Line="11" Column="9"/> + </Position2> + <Position3> + <Filename Value="markov.lpr"/> + <Caret Line="47" Column="20" TopLine="9"/> + </Position3> + <Position4> + <Filename Value="markov.lpr"/> + <Caret Line="21" Column="6" TopLine="14"/> + </Position4> + <Position5> + <Filename Value="markov.lpr"/> + <Caret Line="70" Column="28" TopLine="32"/> + </Position5> + <Position6> + <Filename Value="markov.lpr"/> + <Caret Line="49" Column="52" TopLine="38"/> + </Position6> + <Position7> + <Filename Value="markov.lpr"/> + <Caret Line="117" Column="11" TopLine="78"/> + </Position7> + <Position8> + <Filename Value="markov.lpr"/> + <Caret Line="115" Column="45" TopLine="81"/> + </Position8> + <Position9> + <Filename Value="remove-tags.sh"/> + <Caret Line="9"/> + </Position9> + <Position10> + <Filename 
Value="remove-tags.sh"/> + <Caret Line="15" Column="6"/> + </Position10> + <Position11> + <Filename Value="remove-tags.sh"/> + <Caret Line="9"/> + </Position11> + <Position12> + <Filename Value="remove-tags.sh"/> + <Caret Line="3"/> + </Position12> + <Position13> + <Filename Value="dive-into-wikipedia.sh"/> + <Caret Line="20"/> + </Position13> + <Position14> + <Filename Value="remove-tags.sh"/> + <Caret Line="4"/> + </Position14> + </JumpHistory> + </ProjectSession> +</CONFIG> diff --git a/remove-tags.sh b/remove-tags.sh new file mode 100755 index 0000000..c1a5c1e --- /dev/null +++ b/remove-tags.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +curl -x 'socks5://127.0.0.1:9050' -s "$1" \ + | sed ' + s/>/>\n/g + s/</\n</g + ' \ + | sed -n ' + /^<!--/,/-->$/ d + /^<script[ >]/,\,^</script>$, d + /^<body\(>$\|\s\)/,\,^</body>$, p + ' \ + | sed ' + s/<[^<>]*>//g + ' -- cgit v1.2.3-70-g09d2