diff options
-rw-r--r-- | .gitignore | 3 | ||||
-rwxr-xr-x | dive-into-wikipedia.sh | 4 | ||||
-rwxr-xr-x | generate-random-word-from-wikipedia.sh | 30 | ||||
-rw-r--r-- | markov.lpi | 9 | ||||
-rw-r--r-- | markov.lpr | 50 | ||||
-rw-r--r-- | markov.lps | 33 | ||||
-rwxr-xr-x | remove-tags.sh | 1 |
7 files changed, 95 insertions, 35 deletions
@@ -1,4 +1,5 @@ +*.bak +.words.* markov backup lib - diff --git a/dive-into-wikipedia.sh b/dive-into-wikipedia.sh index 8088e95..0f594bc 100755 --- a/dive-into-wikipedia.sh +++ b/dive-into-wikipedia.sh @@ -1,7 +1,7 @@ #!/bin/bash if [ $# -eq 0 ]; then - "$0" 'https://de.wikipedia.org/wiki/Evolution' + printf 'https://de.wikipedia.org/wiki/Evolution\n' elif [ $# -eq 1 ]; then printf '%s\n' "$1" curl -x 'socks5://127.0.0.1:9050' -s "$1" \ @@ -16,7 +16,7 @@ elif [ $# -eq 1 ]; then ' \ | grep -vxF '' else - parallel -n1 -j0 "$0" + parallel -n1 -j0 "$0" ::: "$@" fi \ | sort -u diff --git a/generate-random-word-from-wikipedia.sh b/generate-random-word-from-wikipedia.sh new file mode 100755 index 0000000..fbc720f --- /dev/null +++ b/generate-random-word-from-wikipedia.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +base_dir=$(readlink -f "$(dirname "$0")") + +depth=$1 +shift + +if [ ! -s "${base_dir}/.words.${depth}" ]; then + + urls=() + + for i in $(seq "${depth}"); do + urls=( + $( + "${base_dir}/dive-into-wikipedia.sh" "${urls[@]}" \ + | sort -u + ) + ) + echo $i >&2 + done + + printf '%s\n' "${urls[@]}" \ + | parallel -j0 -n1 "${base_dir}/remove-tags.sh" \ + | sort -u \ + | grep -vxF '' \ + > "${base_dir}/.words.${depth}" + +fi + +"${base_dir}/markov" "${base_dir}/.words.${depth}" "$@" @@ -14,9 +14,6 @@ <UseAppBundle Value="False"/> <ResourceType Value="res"/> </General> - <VersionInfo> - <StringTable ProductVersion=""/> - </VersionInfo> <BuildModes Count="1"> <Item1 Name="Default" Default="True"/> </BuildModes> @@ -28,7 +25,7 @@ <FormatVersion Value="1"/> </local> </RunParams> - <Units Count="3"> + <Units Count="4"> <Unit0> <Filename Value="markov.lpr"/> <IsPartOfProject Value="True"/> @@ -41,6 +38,10 @@ <Filename Value="dive-into-wikipedia.sh"/> <IsPartOfProject Value="True"/> </Unit2> + <Unit3> + <Filename Value="generate-random-word-from-wikipedia.sh"/> + <IsPartOfProject Value="True"/> + </Unit3> </Units> </ProjectOptions> <CompilerOptions> @@ -12,6 +12,7 @@ uses var wordFileName,s: string; + count: longint; depth,i,j,current: int64; c: char; f: textFile; @@ -22,8 +23,8 @@ var begin randomize; - if paramCount<>2 then begin - writeln(stderr,'usage: markov word-file chain-depth'); + if paramCount<>3 then begin + writeln(stderr,'usage: markov word-file chain-depth count'); halt(1); end; wordFileName:=paramStr(1); @@ -41,6 +42,16 @@ begin writeln(stderr,intToStr(depth)+' is not positive'); halt(1); end; + try + count:=strToInt(paramstr(3)); + except + writeln(stderr,''''+paramStr(3)+''' is not a valid integer'); + halt(1); + end; + if count<=0 then begin + writeln(stderr,intToStr(count)+' is not positive'); + halt(1); + end; i2c:='abcdefghijklmnopqrstuvwxyzäöüßABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜ'; for i:=length(i2c)-1 downto 1 do @@ -64,7 +75,7 @@ begin for i:=1 to length(s) do begin current:=current*(length(i2c)+1) mod length(probabilities); if c2i[s[i]]<0 then begin - if current<>0 then // terminate word + if (current<>0) and (s[i]<>'.') then // terminate word probabilities[current]:=probabilities[current]+1; current:=0; continue; @@ -101,21 +112,24 @@ begin end; *) // generation - current:=0; - repeat - current:=current * (length(i2c)+1) mod length(probabilities); - total:=random; - while total>0 do begin - total:=total - probabilities[current]; - inc(current); - end; - dec(current); - if current mod (length(i2c)+1) = 0 then - break - else - write(i2c[current mod (length(i2c)+1)]); - until false; - writeln; + while count>0 do begin + current:=0; + repeat + current:=current * (length(i2c)+1) mod length(probabilities); + total:=random; + while total>0 do begin + total:=total - probabilities[current]; + inc(current); + end; + dec(current); + if current mod (length(i2c)+1) = 0 then + break + else + write(i2c[current mod (length(i2c)+1)]); + until false; + writeln; + dec(count); + end; setLength(probabilities,0); end. @@ -3,22 +3,22 @@ <ProjectSession> <Version Value="10"/> <BuildModes Active="Default"/> - <Units Count="3"> + <Units Count="4"> <Unit0> <Filename Value="markov.lpr"/> <IsPartOfProject Value="True"/> - <TopLine Value="6"/> - <CursorPos X="13" Y="24"/> - <UsageCount Value="20"/> + <IsVisibleTab Value="True"/> + <TopLine Value="54"/> + <CursorPos X="40" Y="78"/> + <UsageCount Value="23"/> <Loaded Value="True"/> </Unit0> <Unit1> <Filename Value="remove-tags.sh"/> <IsPartOfProject Value="True"/> - <IsVisibleTab Value="True"/> <EditorIndex Value="1"/> - <CursorPos X="42" Y="15"/> - <UsageCount Value="20"/> + <CursorPos X="13" Y="9"/> + <UsageCount Value="23"/> <Loaded Value="True"/> <DefaultSyntaxHighlighter Value="Bash"/> </Unit1> @@ -26,13 +26,22 @@ <Filename Value="dive-into-wikipedia.sh"/> <IsPartOfProject Value="True"/> <EditorIndex Value="2"/> - <CursorPos X="37" Y="16"/> - <UsageCount Value="20"/> + <CursorPos X="52" Y="5"/> + <UsageCount Value="23"/> <Loaded Value="True"/> <DefaultSyntaxHighlighter Value="Bash"/> </Unit2> + <Unit3> + <Filename Value="generate-random-word-from-wikipedia.sh"/> + <IsPartOfProject Value="True"/> + <EditorIndex Value="3"/> + <CursorPos X="55" Y="30"/> + <UsageCount Value="23"/> + <Loaded Value="True"/> + <DefaultSyntaxHighlighter Value="Bash"/> + </Unit3> </Units> - <JumpHistory Count="14" HistoryIndex="13"> + <JumpHistory Count="15" HistoryIndex="14"> <Position1> <Filename Value="markov.lpr"/> <Caret Line="19" Column="12"/> @@ -89,6 +98,10 @@ <Filename Value="remove-tags.sh"/> <Caret Line="4"/> </Position14> + <Position15> + <Filename Value="generate-random-word-from-wikipedia.sh"/> + <Caret Line="9"/> + </Position15> </JumpHistory> </ProjectSession> </CONFIG> diff --git a/remove-tags.sh b/remove-tags.sh index c1a5c1e..5f46c92 100755 --- a/remove-tags.sh +++ b/remove-tags.sh @@ -6,6 +6,7 @@ curl -x 'socks5://127.0.0.1:9050' -s "$1" \ s/</\n</g ' \ | sed -n ' + /^<!--.*>$/ d /^<!--/,/-->$/ d /^<script[ >]/,\,^</script>$, d /^<body\(>$\|\s\)/,\,^</body>$, p |