Skip to content

Commit

Permalink
Refactor dynamic scoring (#2416)
Browse files (browse the repository at this point in the history)
* The _translate method of the ScoringPreparator class relies on a dynamic dataset iterator.
  • Loading branch information
l-k-11235 committed Jun 23, 2023
1 parent 3afced5 commit f8740f1
Show file tree
Hide file tree
Showing 7 changed files with 117 additions and 318 deletions.
43 changes: 6 additions & 37 deletions .github/workflows/push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,10 @@ jobs:
-hidden_size 10 \
-train_steps 10 -valid_steps 5 \
-tensorboard "true" \
-tensorboard_log_dir /tmp/logs_train_valid \
-tensorboard_log_dir /tmp/logs_train_and_valid \
-copy_attn
python onmt/tests/test_events.py --logdir /tmp/logs_train_valid -tensorboard_checks train_valid
python onmt/tests/test_events.py --logdir /tmp/logs_train_and_valid -tensorboard_checks train
python onmt/tests/test_events.py --logdir /tmp/logs_train_and_valid -tensorboard_checks valid
- name: Test RNN training with coverage
run: |
python train.py \
Expand Down Expand Up @@ -141,35 +142,6 @@ jobs:
-attention_dropout 0.2 0.1 0.1 \
-report_every 5 \
-train_steps 10
- name : Test Transformer training with dynamic scoring
run: |
python3 train.py \
-config data/data.yaml \
-src_vocab /tmp/onmt.vocab.src \
-tgt_vocab /tmp/onmt.vocab.tgt \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-encoder_type transformer \
-decoder_type transformer \
-layers 4 \
-word_vec_size 16 \
-hidden_size 16 \
-num_workers 0 -bucket_size 1024 \
-heads 2 \
-transformer_ff 64 \
-num_workers 0 -bucket_size 1024 \
-accum_count 2 4 8 \
-accum_steps 0 15000 30000 \
-save_model /tmp/onmt.model \
-train_steps 20 \
-report_every 5 \
-train_eval_steps 10 \
-train_metrics "BLEU" "TER" \
-tensorboard "true" \
-scoring_debug "true" \
-tensorboard_log_dir /tmp/logs_train_metrics \
-dump_preds /tmp/dump_preds
python onmt/tests/test_events.py --logdir /tmp/logs_train_metrics -tensorboard_checks train_metrics
- name : Test Transformer training and validation with dynamic scoring and copy
run: |
python3 train.py \
Expand All @@ -192,15 +164,13 @@ jobs:
-save_model /tmp/onmt.model \
-train_steps 10 -valid_steps 5 \
-report_every 2 \
-train_eval_steps 8 \
-train_metrics "BLEU" "TER" \
-valid_metrics "BLEU" "TER" \
-tensorboard "true" \
-scoring_debug "true" \
-tensorboard_log_dir /tmp/logs_train_valid_metrics \
-tensorboard_log_dir /tmp/logs_dynamic-scoring_and_copy \
-dump_preds /tmp/dump_preds \
-copy_attn
python onmt/tests/test_events.py --logdir /tmp/logs_train_valid_metrics -tensorboard_checks train_valid_metrics
python onmt/tests/test_events.py --logdir /tmp/logs_dynamic-scoring_and_copy -tensorboard_checks valid_metrics
- name: Test LM training
run: |
python train.py \
Expand Down Expand Up @@ -279,8 +249,7 @@ jobs:
-hidden_size 2 -batch_size 10 \
-word_vec_size 5 -hidden_size 10 \
-num_workers 0 -bucket_size 1024 \
-report_every 5 -train_steps 10 \
-train_metrics "BLEU" "TER" \
-report_every 5 -train_steps 10 -valid_steps 5\
-valid_metrics "BLEU" "TER" \
-save_model /tmp/onmt.model \
-save_checkpoint_steps 10
Expand Down
14 changes: 0 additions & 14 deletions onmt/opts.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,20 +55,6 @@ def _add_logging_opts(parser, is_train=True):
)

if is_train:
group.add(
"--train_eval_steps",
"-train_eval_steps",
type=int,
default=200,
help="calculate training metrics at this interval",
)
group.add(
"--train_metrics",
"-train_metrics",
default=[],
nargs="+",
help="List of names of additional training metrics",
)
group.add(
"--valid_metrics",
"-valid_metrics",
Expand Down
56 changes: 15 additions & 41 deletions onmt/tests/pull_request_chk.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,9 @@ ${PYTHON} -m unittest discover >> ${LOG_FILE} 2>&1
echo "Succeeded" | tee -a ${LOG_FILE}


#

# Get Vocabulary test
#

echo -n "[+] Testing vocabulary building..."
PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH} ${PYTHON} onmt/bin/build_vocab.py \
-config ${DATA_DIR}/data.yaml \
Expand Down Expand Up @@ -138,12 +138,13 @@ ${PYTHON} onmt/bin/train.py \
-word_vec_size 5 -report_every 2 \
-hidden_size 10 -train_steps 10 -valid_steps 5 \
-tensorboard "true" \
-tensorboard_log_dir $TMP_OUT_DIR/logs_train_valid \
-tensorboard_log_dir $TMP_OUT_DIR/logs_train_and_valid \
-copy_attn >> ${LOG_FILE} 2>&1
${PYTHON} onmt/tests/test_events.py --logdir $TMP_OUT_DIR/logs_train_valid -tensorboard_checks train_valid
${PYTHON} onmt/tests/test_events.py --logdir $TMP_OUT_DIR/logs_train_and_valid -tensorboard_checks train
${PYTHON} onmt/tests/test_events.py --logdir $TMP_OUT_DIR/logs_train_and_valid -tensorboard_checks valid
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm -r $TMP_OUT_DIR/logs_train_valid
rm -r $TMP_OUT_DIR/logs_train_and_valid

echo -n " [+] Testing NMT training w/ align..."
${PYTHON} onmt/bin/train.py \
Expand Down Expand Up @@ -175,35 +176,8 @@ ${PYTHON} onmt/bin/train.py \
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}

echo -n " [+] Testing NMT training w/ dynamic scoring..."
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/data.yaml \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt.vocab.tgt \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-encoder_type transformer \
-decoder_type transformer \
-layers 4 \
-word_vec_size 16 \
-hidden_size 16 \
-heads 2 \
-transformer_ff 64 \
-num_workers 0 -bucket_size 1024 \
-train_steps 20 \
-report_every 5 \
-train_eval_steps 10 \
-train_metrics "BLEU" "TER" \
-tensorboard "true" \
-scoring_debug "true" \
-tensorboard_log_dir $TMP_OUT_DIR/logs_train_metrics \
-dump_preds $TMP_OUT_DIR/dump_pred >> ${LOG_FILE} 2>&1
${PYTHON} onmt/tests/test_events.py --logdir $TMP_OUT_DIR/logs_train_metrics -tensorboard_checks train_metrics
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm -r $TMP_OUT_DIR/logs_train_metrics

echo -n " [+] Testing NMT training w/ dynamic scoring with validation and copy ..."
echo -n " [+] Testing NMT training w/ validation with dynamic scoring and copy ..."
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/data.yaml \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
Expand All @@ -221,18 +195,18 @@ ${PYTHON} onmt/bin/train.py \
-bucket_size 1024 \
-train_steps 10 \
-report_every 2 \
-train_eval_steps 8 -valid_steps 5 \
-train_metrics "BLEU" "TER" \
-valid_steps 5 \
-valid_metrics "BLEU" "TER" \
-tensorboard "true" \
-scoring_debug "true" \
-dump_preds $TMP_OUT_DIR/dump_pred \
-copy_attn \
-tensorboard_log_dir $TMP_OUT_DIR/logs_train_valid_metrics >> ${LOG_FILE} 2>&1
${PYTHON} onmt/tests/test_events.py --logdir $TMP_OUT_DIR/logs_train_valid_metrics -tensorboard_checks train_valid_metrics
-dump_preds $TMP_OUT_DIR/dump_pred \
-tensorboard_log_dir $TMP_OUT_DIR/logs_dynamic-scoring_and_copy >> ${LOG_FILE} 2>&1

${PYTHON} onmt/tests/test_events.py --logdir $TMP_OUT_DIR/logs_dynamic-scoring_and_copy -tensorboard_checks valid_metrics
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm -r $TMP_OUT_DIR/logs_train_valid_metrics
rm -r $TMP_OUT_DIR/logs_dynamic-scoring_and_copy

echo -n " [+] Testing LM training..."
${PYTHON} onmt/bin/train.py \
Expand Down Expand Up @@ -374,11 +348,11 @@ ${PYTHON} onmt/bin/train.py \
-batch_size 10 \
-word_vec_size 5 -hidden_size 10 \
-num_workers 0 -bucket_size 1024 \
-report_every 5 -train_steps 10 \
-train_metrics "BLEU" "TER" \
-report_every 5 -train_steps 10 -valid_steps 5\
-valid_metrics "BLEU" "TER" \
-save_model $TMP_OUT_DIR/onmt.features.model \
-save_checkpoint_steps 10 >> ${LOG_FILE} 2>&1

[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm -f $TMP_OUT_DIR/onmt.vocab*
Expand Down
15 changes: 4 additions & 11 deletions onmt/tests/test_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,10 @@ def __init__(self):
metrics = ["BLEU", "TER"]
self.scalars = {}
self.scalars["train"] = [("progress/" + stat) for stat in stats]
self.scalars["train_valid"] = self.scalars["train"] + [
("valid/" + stat) for stat in stats
self.scalars["valid"] = [("valid/" + stat) for stat in stats]
self.scalars["valid_metrics"] = self.scalars["valid"] + [
("valid/" + metric) for metric in metrics
]
self.scalars["train_metrics"] = self.scalars["train"] + [
("progress/" + metric) for metric in metrics
]
self.scalars["train_valid_metrics"] = (
self.scalars["train_metrics"]
+ [("valid/" + stat) for stat in stats]
+ [("valid/" + metric) for metric in metrics]
)

def reload_events(self, path):
ea = event_accumulator.EventAccumulator(
Expand Down Expand Up @@ -49,7 +42,7 @@ def check_scalars(self, scalars, logdir):
"--tensorboard_checks",
type=str,
required=True,
choices=["train", "train_metrics", "train_valid", "train_valid_metrics"],
choices=["train", "valid", "valid_metrics"],
)
args = parser.parse_args()
test_event = TestEvents()
Expand Down
Loading

0 comments on commit f8740f1

Please sign in to comment.