From 01dc6496d063a8f04f5716cd30098db4d0619481 Mon Sep 17 00:00:00 2001 From: Erich Eckner Date: Wed, 9 Jan 2019 23:48:58 +0100 Subject: geht jetzt --- .gitignore | 3 +- dive-into-wikipedia.sh | 4 +-- generate-random-word-from-wikipedia.sh | 30 ++++++++++++++++++++ markov.lpi | 9 +++--- markov.lpr | 50 ++++++++++++++++++++++------------ markov.lps | 33 +++++++++++++++------- remove-tags.sh | 1 + 7 files changed, 95 insertions(+), 35 deletions(-) create mode 100755 generate-random-word-from-wikipedia.sh diff --git a/.gitignore b/.gitignore index 2a8f638..65bec5c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ +*.bak +.words.* markov backup lib - diff --git a/dive-into-wikipedia.sh b/dive-into-wikipedia.sh index 8088e95..0f594bc 100755 --- a/dive-into-wikipedia.sh +++ b/dive-into-wikipedia.sh @@ -1,7 +1,7 @@ #!/bin/bash if [ $# -eq 0 ]; then - "$0" 'https://de.wikipedia.org/wiki/Evolution' + printf 'https://de.wikipedia.org/wiki/Evolution\n' elif [ $# -eq 1 ]; then printf '%s\n' "$1" curl -x 'socks5://127.0.0.1:9050' -s "$1" \ @@ -16,7 +16,7 @@ elif [ $# -eq 1 ]; then ' \ | grep -vxF '' else - parallel -n1 -j0 "$0" + parallel -n1 -j0 "$0" ::: "$@" fi \ | sort -u diff --git a/generate-random-word-from-wikipedia.sh b/generate-random-word-from-wikipedia.sh new file mode 100755 index 0000000..fbc720f --- /dev/null +++ b/generate-random-word-from-wikipedia.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +base_dir=$(readlink -f "$(dirname "$0")") + +depth=$1 +shift + +if [ ! -s "${base_dir}/.words.${depth}" ]; then + + urls=() + + for i in $(seq "${depth}"); do + urls=( + $( + "${base_dir}/dive-into-wikipedia.sh" "${urls[@]}" \ + | sort -u + ) + ) + echo $i >&2 + done + + printf '%s\n' "${urls[@]}" \ + | parallel -j0 -n1 "${base_dir}/remove-tags.sh" \ + | sort -u \ + | grep -vxF '' \ + > "${base_dir}/.words.${depth}" + +fi + +"${base_dir}/markov" "${base_dir}/.words.${depth}" "$@" diff --git a/markov.lpi b/markov.lpi index 22c2077..71a6713 100644 --- a/markov.lpi +++ b/markov.lpi @@ -14,9 +14,6 @@ - - - @@ -28,7 +25,7 @@ - + @@ -41,6 +38,10 @@ + + + + diff --git a/markov.lpr b/markov.lpr index a9e7507..bc4f6b4 100644 --- a/markov.lpr +++ b/markov.lpr @@ -12,6 +12,7 @@ uses var wordFileName,s: string; + count: longint; depth,i,j,current: int64; c: char; f: textFile; @@ -22,8 +23,8 @@ var begin randomize; - if paramCount<>2 then begin - writeln(stderr,'usage: markov word-file chain-depth'); + if paramCount<>3 then begin + writeln(stderr,'usage: markov word-file chain-depth count'); halt(1); end; wordFileName:=paramStr(1); @@ -41,6 +42,16 @@ begin writeln(stderr,intToStr(depth)+' is not positive'); halt(1); end; + try + count:=strToInt(paramstr(3)); + except + writeln(stderr,''''+paramStr(3)+''' is not a valid integer'); + halt(1); + end; + if count<=0 then begin + writeln(stderr,intToStr(count)+' is not positive'); + halt(1); + end; i2c:='abcdefghijklmnopqrstuvwxyzäöüßABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜ'; for i:=length(i2c)-1 downto 1 do @@ -64,7 +75,7 @@ begin for i:=1 to length(s) do begin current:=current*(length(i2c)+1) mod length(probabilities); if c2i[s[i]]<0 then begin - if current<>0 then // terminate word + if (current<>0) and (s[i]<>'.') then // terminate word probabilities[current]:=probabilities[current]+1; current:=0; continue; @@ -101,21 +112,24 @@ begin end; *) // generation - current:=0; - repeat - current:=current * (length(i2c)+1) mod length(probabilities); - total:=random; - while total>0 do begin - total:=total - probabilities[current]; - inc(current); - end; - dec(current); - if current mod (length(i2c)+1) = 0 then - break - else - write(i2c[current mod (length(i2c)+1)]); - until false; - writeln; + while count>0 do begin + current:=0; + repeat + current:=current * (length(i2c)+1) mod length(probabilities); + total:=random; + while total>0 do begin + total:=total - probabilities[current]; + inc(current); + end; + dec(current); + if current mod (length(i2c)+1) = 0 then + break + else + write(i2c[current mod (length(i2c)+1)]); + until false; + writeln; + dec(count); + end; setLength(probabilities,0); end. diff --git a/markov.lps b/markov.lps index 3cbf184..c2d3cbf 100644 --- a/markov.lps +++ b/markov.lps @@ -3,22 +3,22 @@ - + - - - + + + + - - - + + @@ -26,13 +26,22 @@ - - + + + + + + + + + + + - + @@ -89,6 +98,10 @@ + + + + diff --git a/remove-tags.sh b/remove-tags.sh index c1a5c1e..5f46c92 100755 --- a/remove-tags.sh +++ b/remove-tags.sh @@ -6,6 +6,7 @@ curl -x 'socks5://127.0.0.1:9050' -s "$1" \ s/$/ d /^$/ d /^]/,\,^$, d /^$\|\s\)/,\,^$, p -- cgit v1.2.3-70-g09d2