summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Erich Eckner <git@eckner.net> 2019-01-09 16:39:20 +0100
committer: Erich Eckner <git@eckner.net> 2019-01-09 16:39:20 +0100
commit: 747d6044dfcad03f2899f8b68cd37925a63ebbf7 (patch)
tree: a4d60a820e8bea3010e2c22aa9b1aa111d348e95
download: markov-747d6044dfcad03f2899f8b68cd37925a63ebbf7.tar.xz
Initial commit
-rw-r--r-- .gitignore 4
-rwxr-xr-x dive-into-wikipedia.sh 22
-rw-r--r-- markov.lpi 69
-rw-r--r-- markov.lpr 122
-rw-r--r-- markov.lps 94
-rwxr-xr-x remove-tags.sh 15
6 files changed, 326 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2a8f638
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+markov
+backup
+lib
+
diff --git a/dive-into-wikipedia.sh b/dive-into-wikipedia.sh
new file mode 100755
index 0000000..8088e95
--- /dev/null
+++ b/dive-into-wikipedia.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Link crawler for de.wikipedia.org article pages; the output of every mode below is piped through sort -u.
+if [ $# -eq 0 ]; then # no argument: re-invoke this script on a fixed start article
+ "$0" 'https://de.wikipedia.org/wiki/Evolution'
+elif [ $# -eq 1 ]; then # one argument (a URL): print it, then one absolute URL per /wiki/ link whose target has no ':' or '#'
+ printf '%s\n' "$1"
+ curl -x 'socks5://127.0.0.1:9050' -s "$1" \
+ | sed '
+ s/>/>\n/g
+ s/</\n</g
+ ' \
+ | sed '
+ s,^<a \(.* \)\?href="/wiki/\([^":#]\+\)"[ >].*,https://de.wikipedia.org/wiki/\2,
+ t
+ d
+ ' \
+ | grep -vxF ''
+else # more than one argument: the arguments themselves are not used in this branch
+ parallel -n1 -j0 "$0" # NOTE(review): presumably GNU parallel, which then takes one job argument per stdin line - confirm
+fi \
+ | sort -u
+
diff --git a/markov.lpi b/markov.lpi
new file mode 100644
index 0000000..22c2077
--- /dev/null
+++ b/markov.lpi
@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<CONFIG>
+ <ProjectOptions>
+ <Version Value="10"/>
+ <General>
+ <Flags>
+ <MainUnitHasCreateFormStatements Value="False"/>
+ <MainUnitHasTitleStatement Value="False"/>
+ <MainUnitHasScaledStatement Value="False"/>
+ </Flags>
+ <SessionStorage Value="InProjectDir"/>
+ <MainUnit Value="0"/>
+ <Title Value="markov"/>
+ <UseAppBundle Value="False"/>
+ <ResourceType Value="res"/>
+ </General>
+ <VersionInfo>
+ <StringTable ProductVersion=""/>
+ </VersionInfo>
+ <BuildModes Count="1">
+ <Item1 Name="Default" Default="True"/>
+ </BuildModes>
+ <PublishOptions>
+ <Version Value="2"/>
+ </PublishOptions>
+ <RunParams>
+ <local>
+ <FormatVersion Value="1"/>
+ </local>
+ </RunParams>
+ <Units Count="3">
+ <Unit0>
+ <Filename Value="markov.lpr"/>
+ <IsPartOfProject Value="True"/>
+ </Unit0>
+ <Unit1>
+ <Filename Value="remove-tags.sh"/>
+ <IsPartOfProject Value="True"/>
+ </Unit1>
+ <Unit2>
+ <Filename Value="dive-into-wikipedia.sh"/>
+ <IsPartOfProject Value="True"/>
+ </Unit2>
+ </Units>
+ </ProjectOptions>
+ <CompilerOptions>
+ <Version Value="11"/>
+ <Target>
+ <Filename Value="markov"/>
+ </Target>
+ <SearchPaths>
+ <IncludeFiles Value="$(ProjOutDir)"/>
+ <UnitOutputDirectory Value="lib/$(TargetCPU)-$(TargetOS)"/>
+ </SearchPaths>
+ </CompilerOptions>
+ <Debugging>
+ <Exceptions Count="3">
+ <Item1>
+ <Name Value="EAbort"/>
+ </Item1>
+ <Item2>
+ <Name Value="ECodetoolError"/>
+ </Item2>
+ <Item3>
+ <Name Value="EFOpenError"/>
+ </Item3>
+ </Exceptions>
+ </Debugging>
+</CONFIG>
diff --git a/markov.lpr b/markov.lpr
new file mode 100644
index 0000000..a9e7507
--- /dev/null
+++ b/markov.lpr
@@ -0,0 +1,159 @@
+program markov;
+
+{ Character-level Markov chain word generator.
+
+  Usage: markov word-file chain-depth
+
+  Reads word-file line by line, counts how often each symbol (a byte listed
+  in i2c, or the end-of-word marker 0) follows the preceding chain-depth-1
+  symbols, normalises the counts to probabilities and prints one randomly
+  generated word to stdout. Every byte not listed in i2c separates words. }
+
+{$mode objfpc}{$H+}
+
+uses
+  {$IFDEF UNIX}{$IFDEF UseCThreads}
+  cthreads,
+  {$ENDIF}{$ENDIF}
+  Classes
+  { you can add units after this },
+  sysutils, math;
+
+var
+  wordFileName,s: string;
+  depth,i,j,current: int64;
+  base: int64;    // number of symbols: length(i2c) plus the end-of-word marker
+  pick: int64;    // symbol drawn in the current generation step, -1 if none
+  inWord: boolean; // true while the training scan is inside a word
+  c: char;
+  f: textFile;
+  probabilities: array of extended; // index = last `depth` symbols as base-`base` digits, newest lowest
+  total: extended;
+  c2i: array[char] of longint;      // byte -> symbol number (1..length(i2c)), -1 for separators
+  i2c: string;                      // symbol number -> byte
+
+begin
+  randomize;
+  if paramCount<>2 then begin
+    writeln(stderr,'usage: markov word-file chain-depth');
+    halt(1);
+  end;
+  wordFileName:=paramStr(1);
+  if not fileExists(wordFileName) then begin
+    writeln(stderr,'file '''+wordFileName+''' does not exist');
+    halt(1);
+  end;
+  try
+    depth:=strToInt(paramstr(2));
+  except
+    writeln(stderr,''''+paramStr(2)+''' is not a valid integer');
+    halt(1);
+  end;
+  if depth<=0 then begin
+    writeln(stderr,intToStr(depth)+' is not positive');
+    halt(1);
+  end;
+
+  // Alphabet. NOTE(review): the duplicate removal below suggests the source is
+  // stored as UTF-8, where the umlauts are two bytes sharing a lead byte - confirm.
+  i2c:='abcdefghijklmnopqrstuvwxyzäöüßABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜ';
+  // keep only the first occurrence of every byte so that c2i is unambiguous
+  for i:=length(i2c)-1 downto 1 do
+    for j:=length(i2c) downto i+1 do
+      if i2c[i]=i2c[j] then
+        delete(i2c,j,1);
+  for c:=#0 to #255 do
+    c2i[c]:=-1;
+  for i:=1 to length(i2c) do
+    c2i[i2c[i]]:=i;
+  base:=length(i2c)+1;
+
+  // one counter per possible sequence of `depth` symbols: base^depth entries
+  setLength(probabilities,round(power(length(i2c)+1,depth)));
+  for i:=0 to length(probabilities)-1 do
+    probabilities[i]:=0;
+
+  // training: count every observed (context, next symbol) pair
+  assignFile(f,wordFileName);
+  reset(f);
+  while not eof(f) do begin
+    readln(f,s);
+    current:=0; // reset current index
+    inWord:=false;
+    for i:=1 to length(s) do begin
+      // shift the context by one symbol, dropping the oldest one
+      current:=current*base mod length(probabilities);
+      if c2i[s[i]]<0 then begin
+        if inWord then // terminate word
+          probabilities[current]:=probabilities[current]+1;
+        current:=0;
+        inWord:=false;
+        continue;
+      end;
+      current:=current + c2i[s[i]];
+      probabilities[current]:=probabilities[current]+1;
+      inWord:=true;
+    end;
+    // readln strips the line break, so a word reaching the end of the line
+    // has to be terminated explicitly; otherwise a one-word-per-line file
+    // would never record any end-of-word marker
+    if inWord then begin
+      current:=current*base mod length(probabilities);
+      probabilities[current]:=probabilities[current]+1;
+    end;
+  end;
+  closeFile(f);
+
+  // normalise each row (one context, all possible next symbols) to sum 1
+  for i:=0 to length(probabilities) div base-1 do begin
+    total:=0;
+    for j:=0 to base-1 do
+      total:=total + probabilities[j + i*base];
+    if total=0 then
+      continue;
+    total:=1/total;
+    for j:=0 to base-1 do
+      probabilities[j + i*base]:=probabilities[j + i*base] * total;
+  end;
+
+(*
+  // debug output
+  for current:=0 to length(probabilities)-1 do begin
+    j:=current;
+    for i:=1 to depth do begin
+      if (j mod (length(i2c)+1)) = 0 then
+        write('_')
+      else
+        write(i2c[j mod (length(i2c)+1)]);
+      j:=j div (length(i2c)+1);
+    end;
+    writeln(' ',probabilities[current]);
+  end; *)
+
+  // generation
+  current:=0;
+  repeat
+    current:=current*base mod length(probabilities); // first entry of the row
+    total:=random;
+    pick:=-1;
+    // Walk only this row. Remembering the last symbol with a non-zero
+    // probability makes rounding errors in the row sum harmless, and a row
+    // without any entry can no longer run past the end of the array.
+    for j:=0 to base-1 do begin
+      if probabilities[current+j]<=0 then
+        continue;
+      pick:=j;
+      total:=total - probabilities[current+j];
+      if total<0 then
+        break;
+    end;
+    if pick<=0 then // end-of-word marker drawn, or context never seen
+      break;
+    write(i2c[pick]);
+    current:=current+pick;
+  until false;
+  writeln;
+
+  setLength(probabilities,0);
+end.
+
diff --git a/markov.lps b/markov.lps
new file mode 100644
index 0000000..3cbf184
--- /dev/null
+++ b/markov.lps
@@ -0,0 +1,94 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<CONFIG>
+ <ProjectSession>
+ <Version Value="10"/>
+ <BuildModes Active="Default"/>
+ <Units Count="3">
+ <Unit0>
+ <Filename Value="markov.lpr"/>
+ <IsPartOfProject Value="True"/>
+ <TopLine Value="6"/>
+ <CursorPos X="13" Y="24"/>
+ <UsageCount Value="20"/>
+ <Loaded Value="True"/>
+ </Unit0>
+ <Unit1>
+ <Filename Value="remove-tags.sh"/>
+ <IsPartOfProject Value="True"/>
+ <IsVisibleTab Value="True"/>
+ <EditorIndex Value="1"/>
+ <CursorPos X="42" Y="15"/>
+ <UsageCount Value="20"/>
+ <Loaded Value="True"/>
+ <DefaultSyntaxHighlighter Value="Bash"/>
+ </Unit1>
+ <Unit2>
+ <Filename Value="dive-into-wikipedia.sh"/>
+ <IsPartOfProject Value="True"/>
+ <EditorIndex Value="2"/>
+ <CursorPos X="37" Y="16"/>
+ <UsageCount Value="20"/>
+ <Loaded Value="True"/>
+ <DefaultSyntaxHighlighter Value="Bash"/>
+ </Unit2>
+ </Units>
+ <JumpHistory Count="14" HistoryIndex="13">
+ <Position1>
+ <Filename Value="markov.lpr"/>
+ <Caret Line="19" Column="12"/>
+ </Position1>
+ <Position2>
+ <Filename Value="markov.lpr"/>
+ <Caret Line="11" Column="9"/>
+ </Position2>
+ <Position3>
+ <Filename Value="markov.lpr"/>
+ <Caret Line="47" Column="20" TopLine="9"/>
+ </Position3>
+ <Position4>
+ <Filename Value="markov.lpr"/>
+ <Caret Line="21" Column="6" TopLine="14"/>
+ </Position4>
+ <Position5>
+ <Filename Value="markov.lpr"/>
+ <Caret Line="70" Column="28" TopLine="32"/>
+ </Position5>
+ <Position6>
+ <Filename Value="markov.lpr"/>
+ <Caret Line="49" Column="52" TopLine="38"/>
+ </Position6>
+ <Position7>
+ <Filename Value="markov.lpr"/>
+ <Caret Line="117" Column="11" TopLine="78"/>
+ </Position7>
+ <Position8>
+ <Filename Value="markov.lpr"/>
+ <Caret Line="115" Column="45" TopLine="81"/>
+ </Position8>
+ <Position9>
+ <Filename Value="remove-tags.sh"/>
+ <Caret Line="9"/>
+ </Position9>
+ <Position10>
+ <Filename Value="remove-tags.sh"/>
+ <Caret Line="15" Column="6"/>
+ </Position10>
+ <Position11>
+ <Filename Value="remove-tags.sh"/>
+ <Caret Line="9"/>
+ </Position11>
+ <Position12>
+ <Filename Value="remove-tags.sh"/>
+ <Caret Line="3"/>
+ </Position12>
+ <Position13>
+ <Filename Value="dive-into-wikipedia.sh"/>
+ <Caret Line="20"/>
+ </Position13>
+ <Position14>
+ <Filename Value="remove-tags.sh"/>
+ <Caret Line="4"/>
+ </Position14>
+ </JumpHistory>
+ </ProjectSession>
+</CONFIG>
diff --git a/remove-tags.sh b/remove-tags.sh
new file mode 100755
index 0000000..c1a5c1e
--- /dev/null
+++ b/remove-tags.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Usage: remove-tags.sh URL - fetches URL through the SOCKS5 proxy at 127.0.0.1:9050, puts every tag on its own line, drops <!-- --> and <script> ranges, keeps only the <body>...</body> range and finally strips all remaining tags.
+curl -x 'socks5://127.0.0.1:9050' -s "$1" \
+ | sed '
+ s/>/>\n/g
+ s/</\n</g
+ ' \
+ | sed -n '
+ /^<!--/,/-->$/ d
+ /^<script[ >]/,\,^</script>$, d
+ /^<body\(>$\|\s\)/,\,^</body>$, p
+ ' \
+ | sed '
+ s/<[^<>]*>//g
+ '