summaryrefslogtreecommitdiff
path: root/markov.lpr
blob: adf09db47e999c391c416e099c1d845006d3869a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
program markov;

{$mode objfpc}{$H+}

uses
  {$IFDEF UNIX}{$IFDEF UseCThreads}
  cthreads,
  {$ENDIF}{$ENDIF}
  Classes
  { you can add units after this },
  sysutils, math, mystringlistunit;

const
  // Number of consecutive candidates that may be rejected (empty, or already
  // present in the word file) before generation gives up instead of spinning
  // forever on a model that can only reproduce its training words.
  maxRejectionsInARow = 1000000;

var
  wordFileName:  string;               // path of the training word file (argument 1)
  depth:         longint;              // chain depth: symbols per table index (argument 2)
  count:         longint;              // number of words to print (argument 3)
  f:             tMyStringList;        // word file; read for training, queried via hatZeile
  probabilities: array of extended;    // counts, later per-row probabilities
  c2i:           array[char] of longint; // char -> symbol number, -1 = not in alphabet
  i2c:           string;               // symbol number -> char (1-based)
  base:          longint;              // length(i2c)+1; symbol 0 is the word terminator
  tableSize:     int64;                // base**depth = length(probabilities)

// Parses a command line argument as a strictly positive integer.
// Prints a message to stderr and halts with exit code 1 otherwise.
function parsePositiveArgument(const text: string): longint;
begin
  if not tryStrToInt(text,result) then begin
    writeln(stderr,''''+text+''' is not a valid integer');
    halt(1);
  end;
  if result<=0 then begin
    writeln(stderr,intToStr(result)+' is not positive');
    halt(1);
  end;
end;

// Fills i2c, c2i and base from the fixed alphabet literal.
procedure buildAlphabet;
var
  keep,probe: longint;
  ch:         char;
begin
  i2c:='abcdefghijklmnopqrstuvwxyzäöüßABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜ';
  // Drop every later duplicate of a char so each char maps to one symbol.
  // NOTE(review): presumably needed because the umlauts are stored as
  // multi-byte sequences that share bytes - confirm the source encoding.
  for keep:=length(i2c)-1 downto 1 do
    for probe:=length(i2c) downto keep+1 do
      if i2c[keep]=i2c[probe] then
        delete(i2c,probe,1);
  for ch:=#0 to #255 do
    c2i[ch]:=-1;
  for keep:=1 to length(i2c) do
    c2i[i2c[keep]]:=keep;
  base:=length(i2c)+1;
end;

// Computes tableSize = base**depth with integer arithmetic and allocates the
// zeroed table. Halts with exit code 1 if the size would overflow or the
// memory cannot be obtained.
procedure allocateTable;
var
  level:      longint;
  maxEntries: int64;
  k:          int64;
begin
  maxEntries:=high(sizeInt) div sizeOf(extended);
  tableSize:=1;
  for level:=1 to depth do begin
    if tableSize>maxEntries div base then begin
      writeln(stderr,'chain depth '+intToStr(depth)+' is too large');
      halt(1);
    end;
    tableSize:=tableSize*base;
  end;
  try
    setLength(probabilities,tableSize);
  except
    on eOutOfMemory do begin
      writeln(stderr,'not enough memory for chain depth '+intToStr(depth));
      halt(1);
    end;
  end;
  for k:=0 to tableSize-1 do
    probabilities[k]:=0;
end;

// Counts, for every line delivered by f.readln, how often each symbol follows
// each context. A table index is the last depth-1 symbols followed by the
// current symbol, written as a base-'base' number.
procedure train;
var
  line:    string;
  idx,sym: longint;
  context: int64;
  inWord:  boolean;
begin
  while f.readln(line) do begin
    line:=line+' '; // lf ends a word, too
    context:=0;
    inWord:=false;
    for idx:=1 to length(line) do begin
      // shift the context left by one symbol, dropping the oldest one
      context:=(context*base) mod tableSize;
      sym:=c2i[line[idx]];
      if sym<0 then begin
        // A char outside the alphabet ends the word; a '.' is not counted
        // as a word end. inWord replaces the former 'context<>0' test,
        // which is always false for depth 1 and so never recorded word ends.
        if inWord and (line[idx]<>'.') then
          probabilities[context]:=probabilities[context]+1;
        context:=0;
        inWord:=false;
      end
      else begin
        context:=context+sym;
        probabilities[context]:=probabilities[context]+1;
        inWord:=true;
      end;
    end;
  end;
end;

// True if no letter was ever counted in the empty-context row, i.e. the
// word file contained no char of the alphabet at all.
function modelIsEmpty: boolean;
var
  k: longint;
begin
  for k:=1 to base-1 do
    if probabilities[k]>0 then
      exit(false);
  result:=true;
end;

// Turns the counts of every row (one context, 'base' successors) into
// probabilities that sum to 1. Rows without any count stay all-zero.
procedure normalizeRows;
var
  row,rowStart: int64;
  k:            longint;
  rowTotal:     extended;
begin
  for row:=0 to tableSize div base-1 do begin
    rowStart:=row*base;
    rowTotal:=0;
    for k:=0 to base-1 do
      rowTotal:=rowTotal+probabilities[rowStart+k];
    if rowTotal=0 then
      continue;
    rowTotal:=1/rowTotal;
    for k:=0 to base-1 do
      probabilities[rowStart+k]:=probabilities[rowStart+k]*rowTotal;
  end;
end;

{$IFDEF MARKOV_DEBUG}
// Debug output: prints every table index as its symbols ('_' for symbol 0,
// least significant symbol first) followed by the stored value.
procedure dumpProbabilities;
var
  index,rest: int64;
  digit:      longint;
begin
  for index:=0 to tableSize-1 do begin
    rest:=index;
    for digit:=1 to depth do begin
      if rest mod base=0 then
        write('_')
      else
        write(i2c[rest mod base]);
      rest:=rest div base;
    end;
    writeln(' ',probabilities[index]);
  end;
end;
{$ENDIF}

// Draws the next symbol for the row starting at rowStart according to the
// stored probabilities. Returns 0 (terminator) .. base-1.
// The scan never leaves the row and never returns a symbol with probability
// zero: a random value of exactly 0 selects the first possible symbol, a row
// whose sum falls short of the random value by rounding selects its last
// possible symbol, and a row without any entry ends the word.
function drawSymbol(rowStart: int64): longint;
var
  remaining: extended;
  k,lastHit: longint;
begin
  remaining:=random;
  lastHit:=-1;
  for k:=0 to base-1 do
    if probabilities[rowStart+k]>0 then begin
      lastHit:=k;
      remaining:=remaining-probabilities[rowStart+k];
      if remaining<0 then
        exit(k);
    end;
  if lastHit<0 then
    result:=0
  else
    result:=lastHit;
end;

// Prints 'wanted' generated words, one per line, skipping candidates that are
// empty or for which f.hatZeile(candidate,false) is true. Returns false if
// maxRejectionsInARow candidates in a row were skipped.
// NOTE(review): a single word is not length-limited; contexts that were only
// ever followed by '.' or by each other could still cycle - confirm whether
// the input can contain such data.
function generateWords(wanted: longint): boolean;
var
  context:   int64;
  sym:       longint;
  candidate: string;
  rejected:  longint;
begin
  rejected:=0;
  while wanted>0 do begin
    context:=0;
    candidate:='';
    repeat
      context:=(context*base) mod tableSize;
      sym:=drawSymbol(context);
      if sym=0 then
        break;
      candidate:=candidate+i2c[sym];
      context:=context+sym;
    until false;
    if (candidate<>'') and not f.hatZeile(candidate,false) then begin
      writeln(candidate);
      dec(wanted);
      rejected:=0;
    end
    else begin
      inc(rejected);
      if rejected>=maxRejectionsInARow then
        exit(false);
    end;
  end;
  result:=true;
end;

begin
  randomize;
  if paramCount<>3 then begin
    writeln(stderr,'usage: markov word-file chain-depth count');
    halt(1);
  end;
  wordFileName:=paramStr(1);
  if not fileExists(wordFileName) then begin
    writeln(stderr,'file '''+wordFileName+''' does not exist');
    halt(1);
  end;
  depth:=parsePositiveArgument(paramStr(2));
  count:=parsePositiveArgument(paramStr(3));

  buildAlphabet;
  allocateTable;

  f:=tMyStringList.create;
  try
    f.loadFromFile(wordFileName);
    train;
    if modelIsEmpty then begin
      writeln(stderr,'file '''+wordFileName+''' contains no usable words');
      exitCode:=1;
    end
    else begin
      normalizeRows;
      {$IFDEF MARKOV_DEBUG}
      dumpProbabilities;
      {$ENDIF}
      if not generateWords(count) then begin
        writeln(stderr,'giving up: '+intToStr(maxRejectionsInARow)+
          ' consecutive candidates were empty or already in the word file');
        exitCode:=1;
      end;
    end;
  finally
    // release the list and the table even if loading or training raised
    f.free;
    setLength(probabilities,0);
  end;
end.