Commit 012c217: KDD code release
1 parent 47db14a

104 files changed: +12809, -2 lines


.gitignore

+2
@@ -1,3 +1,5 @@
+*.zip
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

+11

@@ -0,0 +1,11 @@
#!/bin/bash

lichesss_raw_dir='/data/chess/bz2/standard/'
output_dir='../../data/player_counts'
mkdir -p $output_dir

for t in $lichesss_raw_dir/*-{01..11}.pgn.bz2 $lichesss_raw_dir/*{3..8}-12.pgn.bz2; do
    fname="$(basename -- $t)"
    echo "${t} ${output_dir}/${fname}.csv.bz2"
    screen -S "filter-${fname}" -dm bash -c "source ~/.bashrc; python3 find_top_players.py ${t} ${output_dir}/${fname}.csv.bz2"
done
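
For reference, the brace globs above pick up every January through November dump plus the December dumps of years ending in 3-8 (presumably 2013-2018 for the Lichess database). A rough Python equivalent of that selection, assuming the usual lichess_db_standard_rated_YYYY-MM.pgn.bz2 naming; this is an illustration, not part of the release:

import glob

raw_dir = '/data/chess/bz2/standard/'
# Months 01-11 of every year present in the dump directory
targets = [p for m in range(1, 12)
           for p in glob.glob(f"{raw_dir}*-{m:02d}.pgn.bz2")]
# December dumps of years ending in 3-8, mirroring the *{3..8}-12 glob above
targets += [p for y in range(3, 9)
            for p in glob.glob(f"{raw_dir}*{y}-12.pgn.bz2")]
print(f"{len(targets)} monthly dumps selected")
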
+49

@@ -0,0 +1,49 @@
#!/bin/bash

lichesss_raw_dir='/data/chess/bz2/standard/'
counts_dir='../../data/player_counts'
counts_file='../../data/player_counts_combined.csv.bz2'
top_list='../../data/player_counts_combined_top_names.csv.bz2'

output_2000_dir='../../data/top_2000_player_games'
output_2000_metadata_dir='../../data/top_2000_player_data'

players_list='../../data/select_transfer_players'

final_data_dir='../../data/transfer_players_data'

num_train=10
num_val=900
num_test=100

python3 combine_player_counts.py $counts_dir/* $counts_file

bzcat $counts_file | head -n 2000 | bzip2 > $top_list

mkdir -p $output_2000_dir

python3 split_by_players.py $top_list $lichesss_raw_dir/*-{01..11}.pgn.bz2 $lichesss_raw_dir/*{3..8}-12.pgn.bz2 $output_2000_dir

rm -v $top_list

mkdir -p $output_2000_metadata_dir

python3 player_game_counts.py $output_2000_dir $output_2000_metadata_dir

python3 select_top_players.py $output_2000_metadata_dir \
    ${players_list}_train.csv $num_train \
    ${players_list}_validate.csv $num_val \
    ${players_list}_test.csv $num_test

mkdir -p $final_data_dir
mkdir -p $final_data_dir/metadata
cp -v ${players_list}*.csv $final_data_dir/metadata

for c in "train" "validate" "test"; do
    mkdir $final_data_dir/${c}
    mkdir $final_data_dir/${c}_metadata
    for t in `tail -n +2 ${players_list}_${c}.csv | awk -F ',' '{print $1}'`; do
        cp -v ${output_2000_dir}/${t}.pgn.bz2 $final_data_dir/${c}
        cp ${output_2000_metadata_dir}/${t}.csv.bz2 $final_data_dir/${c}_metadata
    done
done
+20

@@ -0,0 +1,20 @@
#!/bin/bash
set -e

vals_dat_dir="../../data/transfer_players_data/validate_metadata/"
vals_dir="../../data/transfer_players_validate"
output_dir="../../data/transfer_players_extended"
list_file='../../data/extended_list.csv'

num_per_bin=5
bins="1100 1300 1500 1700 1900"


python3 select_binned_players.py $vals_dat_dir $list_file $num_per_bin $bins

mkdir -p $output_dir

while read player; do
    echo $player
    cp -r ${vals_dir}/${player} ${output_dir}
done < $list_file

0-player_counting/README.md

+3
@@ -0,0 +1,3 @@
# Player Counting

This is the code we used to count the number of games each player has played.
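
The per-month counts produced by find_top_players.py are merged into a single ranked list by combine_player_counts.py (both invoked by the shell scripts above). A minimal sketch of inspecting that merged file, assuming the paths from the shell scripts; not part of the release:

import pandas as pd

# player_counts_combined.csv.bz2 has a 'player,count' header and is sorted by
# count in descending order (see combine_player_counts.py).
counts = pd.read_csv('../../data/player_counts_combined.csv.bz2')
print(counts.head(10))        # the ten most active players
print(counts['count'].sum())  # each game contributes two counts, one per side
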
+31

@@ -0,0 +1,31 @@
import backend

import argparse
import bz2

import pandas

@backend.logged_main
def main():
    parser = argparse.ArgumentParser(description='Collect counts and create list from them', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('inputs', nargs='+', help='input csvs')
    parser.add_argument('output', help='output csv')
    args = parser.parse_args()

    counts = {}
    for p in args.inputs:
        backend.printWithDate(f"Processing {p}", end='\r')
        df = pandas.read_csv(p)
        for i, row in df.iterrows():
            try:
                counts[row['player']] += row['count']
            except KeyError:
                counts[row['player']] = row['count']
    backend.printWithDate("Writing")
    with bz2.open(args.output, 'wt') as f:
        f.write('player,count\n')
        for p, c in sorted(counts.items(), key=lambda x: x[1], reverse=True):
            f.write(f"{p},{c}\n")

if __name__ == '__main__':
    main()
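
A design note rather than a change: the try/except accumulation above is equivalent to a collections.Counter, which merges per-file counts in the same way. A minimal sketch, not part of the release (input_csvs is a hypothetical list of the per-month count files):

from collections import Counter

import pandas

counts = Counter()
for p in input_csvs:  # hypothetical list of per-month count CSVs
    df = pandas.read_csv(p)
    counts.update(dict(zip(df['player'], df['count'])))
# most_common() gives the same descending-by-count ordering used above
for player, count in counts.most_common():
    print(f"{player},{count}")
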

0-player_counting/find_top_players.py

+42
@@ -0,0 +1,42 @@
import backend

import argparse
import bz2

@backend.logged_main
def main():
    parser = argparse.ArgumentParser(description='Count number of times each player occurs in pgn', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('input', help='input pgn')
    parser.add_argument('output', help='output csv')
    parser.add_argument('--exclude_bullet', action='store_false', help='Remove bullet games from counts')
    args = parser.parse_args()

    games = backend.GamesFile(args.input)

    counts = {}

    for i, (d, _) in enumerate(games):
        if args.exclude_bullet and 'Bullet' in d['Event']:
            continue
        else:
            add_player(d['White'], counts)
            add_player(d['Black'], counts)
        if i % 10000 == 0:
            backend.printWithDate(f"{i} done with {len(counts)} players from {args.input}", end='\r')

    backend.printWithDate(f"{i} found total of {len(counts)} players from {args.input}")
    with bz2.open(args.output, 'wt') as f:
        f.write("player,count\n")
        for p, c in sorted(counts.items(), key=lambda x: x[1], reverse=True):
            f.write(f"{p},{c}\n")
    backend.printWithDate("done")

def add_player(p, d):
    try:
        d[p] += 1
    except KeyError:
        d[p] = 1

if __name__ == '__main__':
    main()
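
One behavioural note on the --exclude_bullet flag above: because it uses action='store_false', the default is True, so bullet games are excluded unless the flag is passed, which then includes them. A minimal illustration of that argparse behaviour; not part of the release:

import argparse

p = argparse.ArgumentParser()
p.add_argument('--exclude_bullet', action='store_false')
print(p.parse_args([]).exclude_bullet)                    # True  -> bullet games are skipped
print(p.parse_args(['--exclude_bullet']).exclude_bullet)  # False -> bullet games are counted
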
+67

@@ -0,0 +1,67 @@
import backend

import os
import os.path
import csv
import bz2
import argparse

@backend.logged_main
def main():
    parser = argparse.ArgumentParser(description='Get some stats about each of the games')
    parser.add_argument('targets_dir', help='input pgns dir')
    parser.add_argument('output_dir', help='output csvs dir')
    parser.add_argument('--pool_size', type=int, help='Number of models to run in parallel', default=64)
    args = parser.parse_args()
    multiProc = backend.Multiproc(args.pool_size)
    multiProc.reader_init(Files_lister, args.targets_dir)
    multiProc.processor_init(Games_processor, args.output_dir)

    multiProc.run()

class Files_lister(backend.MultiprocIterable):
    def __init__(self, targets_dir):
        self.targets_dir = targets_dir
        self.targets = [(p.path, p.name.split('.')[0]) for p in os.scandir(targets_dir) if '.pgn.bz2' in p.name]
        backend.printWithDate(f"Found {len(self.targets)} targets in {targets_dir}")
    def __next__(self):
        try:
            backend.printWithDate(f"Pushed target, {len(self.targets)} remaining", end='\r', flush=True)
            return self.targets.pop()
        except IndexError:
            raise StopIteration

class Games_processor(backend.MultiprocWorker):
    def __init__(self, output_dir):
        self.output_dir = output_dir

    def __call__(self, path, name):
        games = backend.GamesFile(path)
        with bz2.open(os.path.join(self.output_dir, f"{name}.csv.bz2"), 'wt') as f:
            writer = csv.DictWriter(f, ["player", "opponent", "game_id", "ELO", "opp_ELO", "was_white", "result", "won", "UTCDate", "UTCTime", "TimeControl"])

            writer.writeheader()
            for d, _ in games:
                game_dat = {}
                game_dat['player'] = name
                game_dat['game_id'] = d['Site'].split('/')[-1]
                game_dat['result'] = d['Result']
                game_dat['UTCDate'] = d['UTCDate']
                game_dat['UTCTime'] = d['UTCTime']
                game_dat['TimeControl'] = d['TimeControl']
                if d['Black'] == name:
                    game_dat['was_white'] = False
                    game_dat['opponent'] = d['White']
                    game_dat['ELO'] = d['BlackElo']
                    game_dat['opp_ELO'] = d['WhiteElo']
                    game_dat['won'] = d['Result'] == '0-1'
                else:
                    game_dat['was_white'] = True
                    game_dat['opponent'] = d['Black']
                    game_dat['ELO'] = d['WhiteElo']
                    game_dat['opp_ELO'] = d['BlackElo']
                    game_dat['won'] = d['Result'] == '1-0'
                writer.writerow(game_dat)

if __name__ == '__main__':
    main()
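
The per-player CSVs written here are what the selection scripts consume. A minimal sketch of reading one and computing a couple of summary statistics, assuming the top_2000_player_data directory from the pipeline script above (the player name is hypothetical); not part of the release:

import pandas as pd

# 'some_player' is a hypothetical name used purely for illustration.
path = '../../data/top_2000_player_data/some_player.csv.bz2'
df = pd.read_csv(path, low_memory=False)

print(len(df), 'games for', df['player'].iloc[0])
print('win rate:', df['won'].mean())
# Mean rating over the last 10,000 rows, matching the selection scripts below.
print('mean ELO:', df['ELO'][-10000:].mean())
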
+52

@@ -0,0 +1,52 @@
import backend

import argparse
import bz2
import glob
import random
import os.path
import multiprocessing

import pandas

@backend.logged_main
def main():
    parser = argparse.ArgumentParser(description='Read all the metadata and select n players from each rating bin', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('csvs_dir', help='dir of csvs')
    parser.add_argument('output_list', help='list of targets')
    parser.add_argument('bin_size', type=int, help='players per bin')
    parser.add_argument('bins', type=int, nargs='+', help='bins')
    parser.add_argument('--pool_size', type=int, help='Number of threads to use for reading', default=48)
    parser.add_argument('--seed', type=int, help='random seed', default=1)
    args = parser.parse_args()
    random.seed(args.seed)

    bins = [int(b // 100 * 100) for b in args.bins]

    with multiprocessing.Pool(args.pool_size) as pool:
        players = pool.map(load_player, glob.glob(os.path.join(args.csvs_dir, '*.csv.bz2')))
    backend.printWithDate(f"Found {len(players)} players, using {len(bins)} bins")
    binned_players = {b: [] for b in bins}
    for p in players:
        pe_round = int(p['elo'] // 100 * 100)
        if pe_round in bins:
            binned_players[pe_round].append(p)
    backend.printWithDate("Found: " + ', '.join([f"{b} : {len(p)}" for b, p in binned_players.items()]))

    with open(args.output_list, 'wt') as f:
        for b, p in binned_players.items():
            random.shuffle(p)
            print(b, [d['name'] for d in p[:args.bin_size]])
            f.write('\n'.join([d['name'] for d in p[:args.bin_size]]) + '\n')

def load_player(path):
    df = pandas.read_csv(path, low_memory=False)
    elo = df['ELO'][-10000:].mean()
    count = len(df)
    return {
        'name': df['player'].iloc[0],
        'elo': elo,
        'count': count,
    }
if __name__ == "__main__":
    main()
+63

@@ -0,0 +1,63 @@
import backend

import argparse
import bz2
import glob
import random
import os.path
import multiprocessing

import pandas

@backend.logged_main
def main():
    parser = argparse.ArgumentParser(description='Read all the metadata and select top n players for training/validation/testing', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('inputs', help='input csvs dir')
    parser.add_argument('output_train', help='output csv for training data')
    parser.add_argument('num_train', type=int, help='num for main training')
    parser.add_argument('output_val', help='output csv for validation data')
    parser.add_argument('num_val', type=int, help='num for big validation run')
    parser.add_argument('output_test', help='output csv for testing data')
    parser.add_argument('num_test', type=int, help='num for holdout set')
    parser.add_argument('--pool_size', type=int, help='Number of models to run in parallel', default=48)
    parser.add_argument('--min_elo', type=int, help='min elo to select', default=1100)
    parser.add_argument('--max_elo', type=int, help='max elo to select', default=2000)
    parser.add_argument('--seed', type=int, help='random seed', default=1)
    args = parser.parse_args()
    random.seed(args.seed)

    targets = glob.glob(os.path.join(args.inputs, '*csv.bz2'))

    with multiprocessing.Pool(args.pool_size) as pool:
        players = pool.starmap(check_player, ((t, args.min_elo, args.max_elo) for t in targets))

    players_top = sorted(
        (p for p in players if p is not None),
        key=lambda x: x[1],
        reverse=True,
    )[:args.num_train + args.num_val + args.num_test]

    random.shuffle(players_top)

    write_output_file(args.output_train, args.num_train, players_top)
    write_output_file(args.output_val, args.num_val, players_top)
    write_output_file(args.output_test, args.num_test, players_top)

def write_output_file(path, count, targets):
    with open(path, 'wt') as f:
        f.write("player,count,ELO\n")
        for i in range(count):
            t = targets.pop()
            f.write(f"{t[0]},{t[1]},{t[2]}\n")

def check_player(path, min_elo, max_elo):
    df = pandas.read_csv(path, low_memory=False)
    elo = df['ELO'][-10000:].mean()
    count = len(df)
    if elo > min_elo and elo < max_elo:
        return path.split('/')[-1].split('.')[0], count, elo
    else:
        return None

if __name__ == "__main__":
    main()
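
As a quick sanity check, the three lists written by write_output_file should be disjoint, since each call pops players from the same shuffled pool. A minimal sketch, assuming the select_transfer_players paths and split sizes from the pipeline script above; not part of the release:

import pandas as pd

base = '../../data/select_transfer_players'  # players_list from the shell script above
splits = {c: set(pd.read_csv(f"{base}_{c}.csv")['player'])
          for c in ('train', 'validate', 'test')}

assert splits['train'].isdisjoint(splits['validate'])
assert splits['train'].isdisjoint(splits['test'])
assert splits['validate'].isdisjoint(splits['test'])
print({c: len(s) for c, s in splits.items()})  # expected sizes: 10 / 900 / 100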
