summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorErich Eckner <git@eckner.net>2019-01-09 23:48:58 +0100
committerErich Eckner <git@eckner.net>2019-01-09 23:48:58 +0100
commit01dc6496d063a8f04f5716cd30098db4d0619481 (patch)
tree344470e32d9de88fc1ac676fcb4ec1769706a94e
parent747d6044dfcad03f2899f8b68cd37925a63ebbf7 (diff)
downloadmarkov-01dc6496d063a8f04f5716cd30098db4d0619481.tar.xz
geht jetzt
-rw-r--r--.gitignore3
-rwxr-xr-xdive-into-wikipedia.sh4
-rwxr-xr-xgenerate-random-word-from-wikipedia.sh30
-rw-r--r--markov.lpi9
-rw-r--r--markov.lpr50
-rw-r--r--markov.lps33
-rwxr-xr-xremove-tags.sh1
7 files changed, 95 insertions, 35 deletions
diff --git a/.gitignore b/.gitignore
index 2a8f638..65bec5c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
+*.bak
+.words.*
markov
backup
lib
-
diff --git a/dive-into-wikipedia.sh b/dive-into-wikipedia.sh
index 8088e95..0f594bc 100755
--- a/dive-into-wikipedia.sh
+++ b/dive-into-wikipedia.sh
@@ -1,7 +1,7 @@
#!/bin/bash
if [ $# -eq 0 ]; then
- "$0" 'https://de.wikipedia.org/wiki/Evolution'
+ printf 'https://de.wikipedia.org/wiki/Evolution\n'
elif [ $# -eq 1 ]; then
printf '%s\n' "$1"
curl -x 'socks5://127.0.0.1:9050' -s "$1" \
@@ -16,7 +16,7 @@ elif [ $# -eq 1 ]; then
' \
| grep -vxF ''
else
- parallel -n1 -j0 "$0"
+ parallel -n1 -j0 "$0" ::: "$@"
fi \
| sort -u
diff --git a/generate-random-word-from-wikipedia.sh b/generate-random-word-from-wikipedia.sh
new file mode 100755
index 0000000..fbc720f
--- /dev/null
+++ b/generate-random-word-from-wikipedia.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+base_dir=$(readlink -f "$(dirname "$0")")
+
+depth=$1
+shift
+
+if [ ! -s "${base_dir}/.words.${depth}" ]; then
+
+ urls=()
+
+ for i in $(seq "${depth}"); do
+ urls=(
+ $(
+ "${base_dir}/dive-into-wikipedia.sh" "${urls[@]}" \
+ | sort -u
+ )
+ )
+ echo $i >&2
+ done
+
+ printf '%s\n' "${urls[@]}" \
+ | parallel -j0 -n1 "${base_dir}/remove-tags.sh" \
+ | sort -u \
+ | grep -vxF '' \
+ > "${base_dir}/.words.${depth}"
+
+fi
+
+"${base_dir}/markov" "${base_dir}/.words.${depth}" "$@"
diff --git a/markov.lpi b/markov.lpi
index 22c2077..71a6713 100644
--- a/markov.lpi
+++ b/markov.lpi
@@ -14,9 +14,6 @@
<UseAppBundle Value="False"/>
<ResourceType Value="res"/>
</General>
- <VersionInfo>
- <StringTable ProductVersion=""/>
- </VersionInfo>
<BuildModes Count="1">
<Item1 Name="Default" Default="True"/>
</BuildModes>
@@ -28,7 +25,7 @@
<FormatVersion Value="1"/>
</local>
</RunParams>
- <Units Count="3">
+ <Units Count="4">
<Unit0>
<Filename Value="markov.lpr"/>
<IsPartOfProject Value="True"/>
@@ -41,6 +38,10 @@
<Filename Value="dive-into-wikipedia.sh"/>
<IsPartOfProject Value="True"/>
</Unit2>
+ <Unit3>
+ <Filename Value="generate-random-word-from-wikipedia.sh"/>
+ <IsPartOfProject Value="True"/>
+ </Unit3>
</Units>
</ProjectOptions>
<CompilerOptions>
diff --git a/markov.lpr b/markov.lpr
index a9e7507..bc4f6b4 100644
--- a/markov.lpr
+++ b/markov.lpr
@@ -12,6 +12,7 @@ uses
var
wordFileName,s: string;
+ count: longint;
depth,i,j,current: int64;
c: char;
f: textFile;
@@ -22,8 +23,8 @@ var
begin
randomize;
- if paramCount<>2 then begin
- writeln(stderr,'usage: markov word-file chain-depth');
+ if paramCount<>3 then begin
+ writeln(stderr,'usage: markov word-file chain-depth count');
halt(1);
end;
wordFileName:=paramStr(1);
@@ -41,6 +42,16 @@ begin
writeln(stderr,intToStr(depth)+' is not positive');
halt(1);
end;
+ try
+ count:=strToInt(paramstr(3));
+ except
+ writeln(stderr,''''+paramStr(3)+''' is not a valid integer');
+ halt(1);
+ end;
+ if count<=0 then begin
+ writeln(stderr,intToStr(count)+' is not positive');
+ halt(1);
+ end;
i2c:='abcdefghijklmnopqrstuvwxyzäöüßABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜ';
for i:=length(i2c)-1 downto 1 do
@@ -64,7 +75,7 @@ begin
for i:=1 to length(s) do begin
current:=current*(length(i2c)+1) mod length(probabilities);
if c2i[s[i]]<0 then begin
- if current<>0 then // terminate word
+ if (current<>0) and (s[i]<>'.') then // terminate word
probabilities[current]:=probabilities[current]+1;
current:=0;
continue;
@@ -101,21 +112,24 @@ begin
end; *)
// generation
- current:=0;
- repeat
- current:=current * (length(i2c)+1) mod length(probabilities);
- total:=random;
- while total>0 do begin
- total:=total - probabilities[current];
- inc(current);
- end;
- dec(current);
- if current mod (length(i2c)+1) = 0 then
- break
- else
- write(i2c[current mod (length(i2c)+1)]);
- until false;
- writeln;
+ while count>0 do begin
+ current:=0;
+ repeat
+ current:=current * (length(i2c)+1) mod length(probabilities);
+ total:=random;
+ while total>0 do begin
+ total:=total - probabilities[current];
+ inc(current);
+ end;
+ dec(current);
+ if current mod (length(i2c)+1) = 0 then
+ break
+ else
+ write(i2c[current mod (length(i2c)+1)]);
+ until false;
+ writeln;
+ dec(count);
+ end;
setLength(probabilities,0);
end.
diff --git a/markov.lps b/markov.lps
index 3cbf184..c2d3cbf 100644
--- a/markov.lps
+++ b/markov.lps
@@ -3,22 +3,22 @@
<ProjectSession>
<Version Value="10"/>
<BuildModes Active="Default"/>
- <Units Count="3">
+ <Units Count="4">
<Unit0>
<Filename Value="markov.lpr"/>
<IsPartOfProject Value="True"/>
- <TopLine Value="6"/>
- <CursorPos X="13" Y="24"/>
- <UsageCount Value="20"/>
+ <IsVisibleTab Value="True"/>
+ <TopLine Value="54"/>
+ <CursorPos X="40" Y="78"/>
+ <UsageCount Value="23"/>
<Loaded Value="True"/>
</Unit0>
<Unit1>
<Filename Value="remove-tags.sh"/>
<IsPartOfProject Value="True"/>
- <IsVisibleTab Value="True"/>
<EditorIndex Value="1"/>
- <CursorPos X="42" Y="15"/>
- <UsageCount Value="20"/>
+ <CursorPos X="13" Y="9"/>
+ <UsageCount Value="23"/>
<Loaded Value="True"/>
<DefaultSyntaxHighlighter Value="Bash"/>
</Unit1>
@@ -26,13 +26,22 @@
<Filename Value="dive-into-wikipedia.sh"/>
<IsPartOfProject Value="True"/>
<EditorIndex Value="2"/>
- <CursorPos X="37" Y="16"/>
- <UsageCount Value="20"/>
+ <CursorPos X="52" Y="5"/>
+ <UsageCount Value="23"/>
<Loaded Value="True"/>
<DefaultSyntaxHighlighter Value="Bash"/>
</Unit2>
+ <Unit3>
+ <Filename Value="generate-random-word-from-wikipedia.sh"/>
+ <IsPartOfProject Value="True"/>
+ <EditorIndex Value="3"/>
+ <CursorPos X="55" Y="30"/>
+ <UsageCount Value="23"/>
+ <Loaded Value="True"/>
+ <DefaultSyntaxHighlighter Value="Bash"/>
+ </Unit3>
</Units>
- <JumpHistory Count="14" HistoryIndex="13">
+ <JumpHistory Count="15" HistoryIndex="14">
<Position1>
<Filename Value="markov.lpr"/>
<Caret Line="19" Column="12"/>
@@ -89,6 +98,10 @@
<Filename Value="remove-tags.sh"/>
<Caret Line="4"/>
</Position14>
+ <Position15>
+ <Filename Value="generate-random-word-from-wikipedia.sh"/>
+ <Caret Line="9"/>
+ </Position15>
</JumpHistory>
</ProjectSession>
</CONFIG>
diff --git a/remove-tags.sh b/remove-tags.sh
index c1a5c1e..5f46c92 100755
--- a/remove-tags.sh
+++ b/remove-tags.sh
@@ -6,6 +6,7 @@ curl -x 'socks5://127.0.0.1:9050' -s "$1" \
s/</\n</g
' \
| sed -n '
+ /^<!--.*>$/ d
/^<!--/,/-->$/ d
/^<script[ >]/,\,^</script>$, d
/^<body\(>$\|\s\)/,\,^</body>$, p