diff --git a/DBE_annual-project.ipynb b/DBE_annual-project.ipynb
index 579b089f2f42054a5c2deb6d01de0672a04fa956..f7167f38e7855563d6134de79f8fb25c33e7e723 100644
--- a/DBE_annual-project.ipynb
+++ b/DBE_annual-project.ipynb
@@ -30,7 +30,7 @@
       "metadata": {
         "id": "TM4RsrfKerFL"
       },
-      "execution_count": null,
+      "execution_count": 1,
       "outputs": []
     },
     {
@@ -60,7 +60,7 @@
       "metadata": {
         "id": "uOZWTmCrfmyR"
       },
-      "execution_count": null,
+      "execution_count": 2,
       "outputs": []
     },
     {
@@ -69,10 +69,26 @@
         "!python -m spacy init fill-config /content/DBE_annual-project/base_config.cfg /content/DBE_annual-project/base_config_spacy.cfg\n"
       ],
       "metadata": {
-        "id": "PcgBSkHnitiO"
+        "id": "PcgBSkHnitiO",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "e935551e-cfcc-41f9-a7f2-9c758a609ede"
       },
-      "execution_count": null,
-      "outputs": []
+      "execution_count": 3,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\u001b[38;5;2m✔ Auto-filled config with all values\u001b[0m\n",
+            "\u001b[38;5;2m✔ Saved config\u001b[0m\n",
+            "/content/DBE_annual-project/base_config_spacy.cfg\n",
+            "You can now add your data and train your pipeline:\n",
+            "python -m spacy train base_config_spacy.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy\n"
+          ]
+        }
+      ]
     },
     {
       "cell_type": "code",
@@ -80,10 +96,57 @@
         "!python -m spacy debug data /content/DBE_annual-project/base_config_spacy.cfg"
       ],
       "metadata": {
-        "id": "SqDuveAaHzFp"
+        "id": "SqDuveAaHzFp",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "8d3ffd0d-9aed-468f-dfdd-ccf70de356cb"
       },
-      "execution_count": null,
-      "outputs": []
+      "execution_count": 4,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\u001b[1m\n",
+            "============================ Data file validation ============================\u001b[0m\n",
+            "Downloading: 100% 481/481 [00:00<00:00, 441kB/s]\n",
+            "Downloading: 100% 899k/899k [00:01<00:00, 693kB/s]\n",
+            "Downloading: 100% 456k/456k [00:01<00:00, 422kB/s]\n",
+            "Downloading: 100% 1.36M/1.36M [00:01<00:00, 1.03MB/s]\n",
+            "Downloading: 100% 501M/501M [00:12<00:00, 40.1MB/s]\n",
+            "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']\n",
+            "- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+            "- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+            "\u001b[38;5;2m✔ Pipeline can be initialized with data\u001b[0m\n",
+            "\u001b[38;5;2m✔ Corpus is loadable\u001b[0m\n",
+            "\u001b[1m\n",
+            "=============================== Training stats ===============================\u001b[0m\n",
+            "Language: en\n",
+            "Training pipeline: transformer, ner\n",
+            "150 training docs\n",
+            "1 evaluation docs\n",
+            "\u001b[38;5;2m✔ No overlap between training and evaluation data\u001b[0m\n",
+            "\u001b[38;5;3m⚠ Low number of examples to train a new pipeline (150)\u001b[0m\n",
+            "\u001b[1m\n",
+            "============================== Vocab & Vectors ==============================\u001b[0m\n",
+            "\u001b[38;5;4mℹ 5235 total word(s) in the data (1424 unique)\u001b[0m\n",
+            "\u001b[38;5;4mℹ No word vectors present in the package\u001b[0m\n",
+            "\u001b[1m\n",
+            "========================== Named Entity Recognition ==========================\u001b[0m\n",
+            "\u001b[38;5;4mℹ 1 label(s)\u001b[0m\n",
+            "0 missing value(s) (tokens with '-' label)\n",
+            "\u001b[38;5;2m✔ Good amount of examples for all labels\u001b[0m\n",
+            "\u001b[38;5;2m✔ Examples without occurrences available for all labels\u001b[0m\n",
+            "\u001b[38;5;2m✔ No entities consisting of or starting/ending with whitespace\u001b[0m\n",
+            "\u001b[38;5;2m✔ No entities crossing sentence boundaries\u001b[0m\n",
+            "\u001b[1m\n",
+            "================================== Summary ==================================\u001b[0m\n",
+            "\u001b[38;5;2m✔ 7 checks passed\u001b[0m\n",
+            "\u001b[38;5;3m⚠ 1 warning\u001b[0m\n"
+          ]
+        }
+      ]
     },
     {
       "cell_type": "code",
@@ -91,22 +154,65 @@
         "!python -m spacy train /content/DBE_annual-project/base_config_spacy.cfg --gpu-id 0 --paths.train /content/DBE_annual-project/train.spacy --paths.dev /content/DBE_annual-project/dev.spacy --output ./"
       ],
       "metadata": {
-        "id": "VLxW_Bcci9N1"
+        "id": "VLxW_Bcci9N1",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "e6f96b52-8714-475b-b45e-c5c29a06fd8f"
       },
-      "execution_count": null,
-      "outputs": []
+      "execution_count": 5,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\u001b[38;5;4mℹ Saving to output directory: .\u001b[0m\n",
+            "\u001b[38;5;4mℹ Using GPU: 0\u001b[0m\n",
+            "\u001b[1m\n",
+            "=========================== Initializing pipeline ===========================\u001b[0m\n",
+            "[2023-01-14 11:33:26,302] [INFO] Set up nlp object from config\n",
+            "INFO:spacy:Set up nlp object from config\n",
+            "[2023-01-14 11:33:26,312] [INFO] Pipeline: ['transformer', 'ner']\n",
+            "INFO:spacy:Pipeline: ['transformer', 'ner']\n",
+            "[2023-01-14 11:33:26,316] [INFO] Created vocabulary\n",
+            "INFO:spacy:Created vocabulary\n",
+            "[2023-01-14 11:33:26,316] [INFO] Finished initializing nlp object\n",
+            "INFO:spacy:Finished initializing nlp object\n",
+            "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']\n",
+            "- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+            "- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+            "[2023-01-14 11:33:43,715] [INFO] Initialized pipeline components: ['transformer', 'ner']\n",
+            "INFO:spacy:Initialized pipeline components: ['transformer', 'ner']\n",
+            "\u001b[38;5;2m✔ Initialized pipeline\u001b[0m\n",
+            "\u001b[1m\n",
+            "============================= Training pipeline =============================\u001b[0m\n",
+            "\u001b[38;5;4mℹ Pipeline: ['transformer', 'ner']\u001b[0m\n",
+            "\u001b[38;5;4mℹ Initial learn rate: 0.0\u001b[0m\n",
+            "E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE \n",
+            "---  ------  -------------  --------  ------  ------  ------  ------\n",
+            "  0       0        6733.00    778.82    0.00    0.00    0.00    0.00\n",
+            "\n",
+            "Aborted!\n"
+          ]
+        }
+      ]
     },
     {
       "cell_type": "code",
       "source": [
         "%%capture \n",
-        "zipname = \"annual-project.zip\"\n",
+        "import pytz\n",
+        "import datetime\n",
+        "timezone = pytz.timezone(\"Europe/Berlin\")\n",
+        "timestamp = str(datetime.datetime.now(timezone).strftime(\"%Y_%m_%d__%H_%M_%S\"))\n",
+        "\n",
+        "zipname = \"annual-project_\" + str(timestamp) + \".zip\"\n",
        "!zip -r /content/drive/MyDrive/Colab_files/{zipname} /content/DBE_annual-project"
       ],
       "metadata": {
         "id": "h-k2chKIO-IK"
       },
-      "execution_count": null,
+      "execution_count": 7,
       "outputs": []
     }
   ]
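
Note on the training cell: `spacy train ... --output ./` writes `model-best/` and `model-last/` into the output directory once at least one checkpoint has been evaluated; the run captured above was aborted at step 0, so it would not have produced one. For a completed run, a minimal inference sketch could look like the following (the sample sentence is hypothetical, and the single NER label is whatever the project's training data defines):

```python
# Minimal sketch: load the trained pipeline and run NER on one sentence.
# Assumes a *completed* `spacy train --output ./` run, which saves
# model-best/ (best dev ENTS_F) and model-last/ next to the notebook.
import spacy

nlp = spacy.load("/content/model-best")

# Hypothetical input; replace with text from the project's domain.
doc = nlp("Example sentence from the annotated corpus.")
for ent in doc.ents:
    # Each span carries the label predicted by the trained NER component.
    print(ent.text, ent.label_)
```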
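The last hunk changes the Drive backup cell from a fixed `annual-project.zip` name to one carrying a Europe/Berlin timestamp, so successive runs no longer overwrite each other. A pure-Python sketch of the same logic, using `shutil.make_archive` instead of the cell's `!zip` shell call (paths taken from the diff):

```python
# Standalone equivalent of the updated backup cell, sketched with shutil
# rather than the notebook's `!zip`; paths assume the same Colab layout.
import datetime
import shutil

import pytz

# Same Europe/Berlin timestamp format as in the notebook cell.
timezone = pytz.timezone("Europe/Berlin")
timestamp = datetime.datetime.now(timezone).strftime("%Y_%m_%d__%H_%M_%S")

# make_archive appends the .zip suffix to the base name itself.
archive = shutil.make_archive(
    f"/content/drive/MyDrive/Colab_files/annual-project_{timestamp}",
    "zip",
    "/content/DBE_annual-project",
)
print(archive)  # full path of the created archive
```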