{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "601b739b",
"metadata": {
"_cell_guid": "f3fb334d-43c6-4fae-93a0-4b6783397ad9",
"_uuid": "da9ddfdf-832b-4d7d-bb6a-942f879f6447",
"collapsed": false,
"execution": {
"iopub.execute_input": "2024-11-11T01:06:50.055739Z",
"iopub.status.busy": "2024-11-11T01:06:50.055346Z",
"iopub.status.idle": "2024-11-11T01:06:50.059890Z",
"shell.execute_reply": "2024-11-11T01:06:50.059090Z"
},
"id": "gYpPYG55PDy0",
"jupyter": {
"outputs_hidden": false
},
"papermill": {
"duration": 0.02954,
"end_time": "2024-11-11T01:06:50.061890",
"exception": false,
"start_time": "2024-11-11T01:06:50.032350",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"## Try reversing the input string (expected to improve performance)\n",
"## Adopt beam search"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ee5d0131",
"metadata": {
"_cell_guid": "53f3ea36-80e9-4eed-9b09-11fa0490ae70",
"_uuid": "2075d24a-41ad-4336-b551-a0909866836c",
"collapsed": false,
"execution": {
"iopub.execute_input": "2024-11-11T01:06:50.103209Z",
"iopub.status.busy": "2024-11-11T01:06:50.102887Z",
"iopub.status.idle": "2024-11-11T01:07:30.253102Z",
"shell.execute_reply": "2024-11-11T01:07:30.251825Z"
},
"executionInfo": {
"elapsed": 12078,
"status": "ok",
"timestamp": 1731233836806,
"user": {
"displayName": "Abdelrhman Ashraf",
"userId": "11249532378747886614"
},
"user_tz": -120
},
"id": "sRPtnzMJ5DEN",
"jupyter": {
"outputs_hidden": false
},
"outputId": "7a2d1eaf-ddb8-4082-b260-dd52ed452c9b",
"papermill": {
"duration": 40.174693,
"end_time": "2024-11-11T01:07:30.256024",
"exception": false,
"start_time": "2024-11-11T01:06:50.081331",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"## Work around the camel-kenlm wheel error: install the packages camel-tools needs by hand\n",
"## (kenlm is not in the list), then install camel-tools itself with --no-deps.\n",
"## %pip (not !pip) installs into the environment of the running kernel.\n",
"%pip install -q future six docopt cachetools numpy scipy pandas scikit-learn torch transformers editdistance requests emoji pyrsistent muddler\n",
"%pip install -q camel-tools --no-deps\n",
"%pip install -q contractions datasets\n",
"\n",
"%pip install -q kaggle"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d453e83f",
"metadata": {
"_cell_guid": "8aa266b9-fd1e-4344-8d1a-5a7fab478b63",
"_uuid": "426a4f4a-d536-4eb3-a9d7-15e0b265fb63",
"collapsed": false,
"execution": {
"iopub.execute_input": "2024-11-11T01:07:30.296516Z",
"iopub.status.busy": "2024-11-11T01:07:30.296121Z",
"iopub.status.idle": "2024-11-11T01:07:43.286183Z",
"shell.execute_reply": "2024-11-11T01:07:43.284925Z"
},
"executionInfo": {
"elapsed": 264,
"status": "ok",
"timestamp": 1731240458903,
"user": {
"displayName": "Abdelrhman Ashraf",
"userId": "11249532378747886614"
},
"user_tz": -120
},
"id": "oKlf2c2DFqoS",
"jupyter": {
"outputs_hidden": false
},
"papermill": {
"duration": 13.013489,
"end_time": "2024-11-11T01:07:43.288838",
"exception": false,
"start_time": "2024-11-11T01:07:30.275349",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"from tqdm import tqdm\n",
"import time\n",
"from camel_tools.tokenizers.word import simple_word_tokenize\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"from zipfile import ZipFile\n",
"import torch\n",
"from torch import nn\n",
"from torch.nn import functional as F\n",
"from torch.utils.data import DataLoader, Dataset\n",
"import spacy\n",
"from collections import Counter\n",
"import random\n",
"import unicodedata\n",
"import pyarabic.araby as araby\n",
"import contractions\n",
"import nltk\n",
"from datasets import load_dataset\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "83ec4bc2",
"metadata": {
"_cell_guid": "9b098a3a-f196-4d86-85e8-4918c3edbab4",
"_uuid": "eec15a85-e283-4d85-97f5-cb6d80ab078a",
"collapsed": false,
"execution": {
"iopub.execute_input": "2024-11-11T01:07:43.328761Z",
"iopub.status.busy": "2024-11-11T01:07:43.328171Z",
"iopub.status.idle": "2024-11-11T01:07:43.333426Z",
"shell.execute_reply": "2024-11-11T01:07:43.332598Z"
},
"executionInfo": {
"elapsed": 272,
"status": "ok",
"timestamp": 1731234205580,
"user": {
"displayName": "Abdelrhman Ashraf",
"userId": "11249532378747886614"
},
"user_tz": -120
},
"id": "DlvMCmdIWudl",
"jupyter": {
"outputs_hidden": false
},
"papermill": {
"duration": 0.027077,
"end_time": "2024-11-11T01:07:43.335287",
"exception": false,
"start_time": "2024-11-11T01:07:43.308210",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"lr = 1e-3\n",
"epochs = 50\n",
"valid_test_size = 0.3\n",
"# maxlen = 100 # length of one training sample by words\n",
"embd_features = 128 # length of embedding vectors for each word (input_size) (=1000 in paper)\n",
"batch_size = 64\n",
"max_freq = 2 # to add all words to the vocabulary that seen more than one time\n",
"lstm_hidden_size = 128 # The number of features in the hidden state (=1000 in paper)\n",
"lstm_layers = 4 # Number of stacked recurrent layers\n",
"dropout_p = 0.5"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "fc2b096a",
"metadata": {
"_cell_guid": "a421e16e-78a1-45f1-a185-5ede26629408",
"_uuid": "3eb4f197-592d-4d39-8a09-79cc7b939b10",
"collapsed": false,
"execution": {
"iopub.execute_input": "2024-11-11T01:07:43.375090Z",
"iopub.status.busy": "2024-11-11T01:07:43.374478Z",
"iopub.status.idle": "2024-11-11T01:07:43.379619Z",
"shell.execute_reply": "2024-11-11T01:07:43.378747Z"
},
"executionInfo": {
"elapsed": 4,
"status": "ok",
"timestamp": 1731233855441,
"user": {
"displayName": "Abdelrhman Ashraf",
"userId": "11249532378747886614"
},
"user_tz": -120
},
"id": "SPMCS8ajW1jK",
"jupyter": {
"outputs_hidden": false
},
"papermill": {
"duration": 0.027399,
"end_time": "2024-11-11T01:07:43.381624",
"exception": false,
"start_time": "2024-11-11T01:07:43.354225",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"seed = 42\n",
"# Seed the global RNGs of every library imported above, not only the dedicated\n",
"# torch generator `g`, so stochastic steps start from the same state on every run.\n",
"random.seed(seed)\n",
"np.random.seed(seed)\n",
"torch.manual_seed(seed)\n",
"g = torch.Generator().manual_seed(seed)\n",
"\n",
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"\n",
"# Directory the data files are read from; '/content' is presumably the Colab working directory.\n",
"isKaggle = True\n",
"base_dir = '/kaggle/working' if isKaggle else '/content'"
]
},
{
"cell_type": "markdown",
"id": "510bed3e",
"metadata": {
"_cell_guid": "2823160c-9473-4388-b059-a8f97ecebc30",
"_uuid": "fee9820e-a6f6-414c-b042-b320f067a2e8",
"collapsed": false,
"id": "jKW1NUp9XA9S",
"jupyter": {
"outputs_hidden": false
},
"papermill": {
"duration": 0.018429,
"end_time": "2024-11-11T01:07:43.418685",
"exception": false,
"start_time": "2024-11-11T01:07:43.400256",
"status": "completed"
},
"tags": []
},
"source": [
"## Data"
]
},
{
"cell_type": "markdown",
"id": "02c1cc46",
"metadata": {
"_cell_guid": "304718a6-ce41-4489-870f-cd7241f2ebdf",
"_uuid": "2872f338-182c-484e-9be4-c3d7322a3e40",
"collapsed": false,
"id": "TFfKvsudXDMI",
"jupyter": {
"outputs_hidden": false
},
"papermill": {
"duration": 0.018401,
"end_time": "2024-11-11T01:07:43.455689",
"exception": false,
"start_time": "2024-11-11T01:07:43.437288",
"status": "completed"
},
"tags": []
},
"source": [
"### Downloading"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "4d155cf0",
"metadata": {
"_cell_guid": "eb367fb7-e526-4d7d-8b1e-e576bdaa58a2",
"_uuid": "cd81eef9-90f5-466c-94dd-56c123e5ffda",
"collapsed": false,
"execution": {
"iopub.execute_input": "2024-11-11T01:07:43.494855Z",
"iopub.status.busy": "2024-11-11T01:07:43.494145Z",
"iopub.status.idle": "2024-11-11T01:07:57.356235Z",
"shell.execute_reply": "2024-11-11T01:07:57.355391Z"
},
"executionInfo": {
"elapsed": 6154,
"status": "ok",
"timestamp": 1731233861591,
"user": {
"displayName": "Abdelrhman Ashraf",
"userId": "11249532378747886614"
},
"user_tz": -120
},
"id": "y7jys3D2W2rf",
"jupyter": {
"outputs_hidden": false
},
"outputId": "7eaed1e1-0941-4575-8482-e7f8d328c531",
"papermill": {
"duration": 13.883923,
"end_time": "2024-11-11T01:07:57.358298",
"exception": false,
"start_time": "2024-11-11T01:07:43.474375",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset URL: https://www.kaggle.com/datasets/samirmoustafa/arabic-to-english-translation-sentences\r\n",
"License(s): copyright-authors\r\n",
"Archive: /kaggle/working/arabic-to-english-translation-sentences.zip\r\n",
" inflating: ara_eng.txt \r\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b5839b51ee1d4222b48ff200fa2aac6f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tatoeba_mt.py: 0%| | 0.00/15.5k [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "132bbe8e2c1e488ba4b8a148ba24f460",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"dataset_infos.json: 0%| | 0.00/1.96M [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "53ca5f02ab1d4a9c9b0a26f206959de2",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"README.md: 0%| | 0.00/12.1k [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d7945159d4774b6b8cb2554077b49b41",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tatoeba-test.ara-eng.tsv: 0%| | 0.00/938k [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "1a800a2434f6435e8bbeadbc58a75a6d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tatoeba-dev.ara-eng.tsv: 0%| | 0.00/1.78M [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e87fe085d45c4fa0a0ff2a7f9d3cbf0e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating test split: 0%| | 0/10304 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "81a984dbbfa54e8b8f64fbae745422cb",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating validation split: 0%| | 0/19528 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# !wget -q https://www.manythings.org/anki/ara-eng.zip\n",
"# Everything is written to base_dir, because that is where the loading cell reads it from.\n",
"!kaggle datasets download -q -p '{base_dir}' samirmoustafa/arabic-to-english-translation-sentences\n",
"# -o overwrites without prompting, so the cell can be re-run in the same session.\n",
"!unzip -o '{base_dir}/arabic-to-english-translation-sentences.zip' -d '{base_dir}'\n",
"\n",
"# The URL is quoted so the shell does not treat '?' as a glob character.\n",
"!wget -q -O '{base_dir}/tatoeba.tsv' 'https://drive.google.com/uc?id=1aO0yDI4-rDxD5J0OYAgVlUUQuMuqtFSD'\n",
"\n",
"# NOTE: trust_remote_code=True runs the dataset's own loading script; keep it only for trusted datasets.\n",
"hf_dataset = load_dataset('Helsinki-NLP/tatoeba_mt','ara-eng', trust_remote_code=True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "b854edf1",
"metadata": {
"_cell_guid": "69daee9e-22ad-4562-9984-d49410ff0afc",
"_uuid": "05c81fe0-213d-4db9-a0c1-fe2cd66153e1",
"collapsed": false,
"execution": {
"iopub.execute_input": "2024-11-11T01:07:57.401994Z",
"iopub.status.busy": "2024-11-11T01:07:57.401118Z",
"iopub.status.idle": "2024-11-11T01:07:59.341331Z",
"shell.execute_reply": "2024-11-11T01:07:59.340234Z"
},
"executionInfo": {
"elapsed": 416,
"status": "ok",
"timestamp": 1731233862003,
"user": {
"displayName": "Abdelrhman Ashraf",
"userId": "11249532378747886614"
},
"user_tz": -120
},
"id": "6jEOnMR3LK4A",
"jupyter": {
"outputs_hidden": false
},
"papermill": {
"duration": 1.964721,
"end_time": "2024-11-11T01:07:59.344111",
"exception": false,
"start_time": "2024-11-11T01:07:57.379390",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"def _hf_split_to_frame(split):\n",
"    \"\"\"Build an AR_sentence/EN_sentence DataFrame from one split of `hf_dataset`.\"\"\"\n",
"    pairs = pd.DataFrame(split)[['sourceString','targetString']]\n",
"    pairs.columns = ['AR_sentence', 'EN_sentence']\n",
"    return pairs\n",
"\n",
"# Tab-separated EN/AR pairs from the Kaggle archive.\n",
"df_manythings = pd.read_csv(\n",
"    f'{base_dir}/ara_eng.txt',\n",
"    delimiter='\\t',\n",
"    names=['EN_sentence', 'AR_sentence'],\n",
")\n",
"\n",
"# The Tatoeba export also carries sentence ids, which are dropped right away.\n",
"tatoeba_columns = ['EN_id', 'EN_sentence', 'AR_id', 'AR_sentence']\n",
"df_tatoeba = pd.read_csv(f'{base_dir}/tatoeba.tsv', delimiter='\\t', names=tatoeba_columns)\n",
"df_tatoeba = df_tatoeba.drop(columns=['EN_id', 'AR_id'])\n",
"\n",
"df_hf_1 = _hf_split_to_frame(hf_dataset['test'])\n",
"df_hf_2 = _hf_split_to_frame(hf_dataset['validation'])\n",
"\n",
"# Columns are aligned by name; ignore_index=True already yields a fresh 0..n-1 index.\n",
"all_sources = [df_manythings, df_tatoeba, df_hf_1, df_hf_2]\n",
"df_data = pd.concat(all_sources, axis=0, ignore_index=True)"
]
},
{
"cell_type": "markdown",
"id": "1896c69b",
"metadata": {
"_cell_guid": "658f76c9-5ad7-44a3-82dc-b95d3941f7a6",
"_uuid": "b0f7587c-8168-4dc8-ae42-77d1a3d7d588",
"collapsed": false,
"id": "MeuhdoYUXyyE",
"jupyter": {
"outputs_hidden": false
},
"papermill": {
"duration": 0.019918,
"end_time": "2024-11-11T01:07:59.385541",
"exception": false,
"start_time": "2024-11-11T01:07:59.365623",
"status": "completed"
},
"tags": []
},
"source": [
"### Visualizing"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7ad2a644",
"metadata": {
"_cell_guid": "39366db9-8955-4c51-bc45-e2d32f4fca2c",
"_uuid": "999e8525-2c9b-4f46-8957-cab7543f7a3b",
"collapsed": false,
"execution": {
"iopub.execute_input": "2024-11-11T01:07:59.427717Z",
"iopub.status.busy": "2024-11-11T01:07:59.426958Z",
"iopub.status.idle": "2024-11-11T01:07:59.441030Z",
"shell.execute_reply": "2024-11-11T01:07:59.439963Z"
},
"executionInfo": {
"elapsed": 11,
"status": "ok",
"timestamp": 1731233862004,
"user": {
"displayName": "Abdelrhman Ashraf",
"userId": "11249532378747886614"
},
"user_tz": -120
},
"id": "pr4K4EuwW1hG",
"jupyter": {
"outputs_hidden": false
},
"outputId": "22f7f6eb-8f88-4462-c090-f3c6f854a98f",
"papermill": {
"duration": 0.037366,
"end_time": "2024-11-11T01:07:59.443057",
"exception": false,
"start_time": "2024-11-11T01:07:59.405691",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" EN_sentence | \n",
" AR_sentence | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Hi. | \n",
" مرحبًا. | \n",
"
\n",
" \n",
" 1 | \n",
" Run! | \n",
" اركض! | \n",
"
\n",
" \n",
" 2 | \n",
" Help! | \n",
" النجدة! | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" EN_sentence AR_sentence\n",
"0 Hi. مرحبًا.\n",
"1 Run! اركض!\n",
"2 Help! النجدة!"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_data.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "f9c198b5",
"metadata": {
"_cell_guid": "f4aa1236-9a0d-4404-a567-99b74ce5cd62",
"_uuid": "2bdd8195-553f-420f-bacd-34f14fc7a122",
"collapsed": false,
"execution": {
"iopub.execute_input": "2024-11-11T01:07:59.485375Z",
"iopub.status.busy": "2024-11-11T01:07:59.485061Z",
"iopub.status.idle": "2024-11-11T01:07:59.493835Z",
"shell.execute_reply": "2024-11-11T01:07:59.492990Z"
},
"executionInfo": {
"elapsed": 8,
"status": "ok",
"timestamp": 1731233862004,
"user": {
"displayName": "Abdelrhman Ashraf",
"userId": "11249532378747886614"
},
"user_tz": -120
},
"id": "Q6_5CpqeR1WZ",
"jupyter": {
"outputs_hidden": false
},
"outputId": "dafe43f3-f344-4d77-b589-e3dde59226e9",
"papermill": {
"duration": 0.031971,
"end_time": "2024-11-11T01:07:59.495776",
"exception": false,
"start_time": "2024-11-11T01:07:59.463805",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" EN_sentence | \n",
" AR_sentence | \n",
"
\n",
" \n",
" \n",
" \n",
" 101406 | \n",
" You'd better go. | \n",
" يستحسن انك تروح. | \n",
"
\n",
" \n",
" 101407 | \n",
" You should not talk here. | \n",
" يستحسن إنك ما تتكلمش هنا. | \n",
"
\n",
" \n",
" 101408 | \n",
" Make your choice. | \n",
" يلا اختار. | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" EN_sentence AR_sentence\n",
"101406 You'd better go. يستحسن انك تروح.\n",
"101407 You should not talk here. يستحسن إنك ما تتكلمش هنا.\n",
"101408 Make your choice. يلا اختار."
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_data.tail(3)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "ddd88e87",
"metadata": {
"_cell_guid": "82791955-ac05-4454-a04d-79ff1445e6b5",
"_uuid": "eb3335c0-e8c8-458d-97a9-cb5521fd4594",
"collapsed": false,
"execution": {
"iopub.execute_input": "2024-11-11T01:07:59.539311Z",
"iopub.status.busy": "2024-11-11T01:07:59.538539Z",
"iopub.status.idle": "2024-11-11T01:07:59.783825Z",
"shell.execute_reply": "2024-11-11T01:07:59.782983Z"
},
"executionInfo": {
"elapsed": 312,
"status": "ok",
"timestamp": 1731233862309,
"user": {
"displayName": "Abdelrhman Ashraf",
"userId": "11249532378747886614"
},
"user_tz": -120
},
"id": "lkehEZmETpS3",
"jupyter": {
"outputs_hidden": false
},
"papermill": {
"duration": 0.269515,
"end_time": "2024-11-11T01:07:59.786005",
"exception": false,
"start_time": "2024-11-11T01:07:59.516490",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"def _space_token_count(sentence):\n",
"    \"\"\"Number of pieces obtained by splitting `sentence` on single spaces.\"\"\"\n",
"    return len(sentence.split(' '))\n",
"\n",
"# Same word-count column for both languages.\n",
"for col_prefix in ('EN', 'AR'):\n",
"    df_data[col_prefix + '_sentence_length'] = df_data[col_prefix + '_sentence'].apply(_space_token_count)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "c2512e94",
"metadata": {
"_cell_guid": "2ce3e90d-84aa-4528-b3b8-ebb69e124bee",
"_uuid": "c0db62ea-a6e2-4884-8cb9-459b90c5d335",
"collapsed": false,
"execution": {
"iopub.execute_input": "2024-11-11T01:07:59.829149Z",
"iopub.status.busy": "2024-11-11T01:07:59.828349Z",
"iopub.status.idle": "2024-11-11T01:08:00.536692Z",
"shell.execute_reply": "2024-11-11T01:08:00.535734Z"
},
"executionInfo": {
"elapsed": 922,
"status": "ok",
"timestamp": 1731233863229,
"user": {
"displayName": "Abdelrhman Ashraf",
"userId": "11249532378747886614"
},
"user_tz": -120
},
"id": "yYMZjHJsaUBK",
"jupyter": {
"outputs_hidden": false
},
"outputId": "0ba341d8-f865-496b-9920-fb489d215d61",
"papermill": {
"duration": 0.732373,
"end_time": "2024-11-11T01:08:00.539150",
"exception": false,
"start_time": "2024-11-11T01:07:59.806777",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# One histogram per language, side by side.\n",
"fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n",
"\n",
"# X ticks every 25 words, up to the longest sentence of each language.\n",
"EN_x_ticks = np.arange(0, df_data['EN_sentence_length'].max()+1, 25)\n",
"AR_x_ticks = np.arange(0, df_data['AR_sentence_length'].max()+1, 25)\n",
"\n",
"# (length column, panel title, bar colour, x ticks) for each panel, left to right.\n",
"panels = [\n",
"    ('EN_sentence_length', 'EN Sentence Length', 'skyblue', EN_x_ticks),\n",
"    ('AR_sentence_length', 'AR Sentence Length', 'salmon', AR_x_ticks),\n",
"]\n",
"for ax, (length_col, panel_title, bar_color, x_ticks) in zip(axes, panels):\n",
"    ax.hist(df_data[length_col], bins=50, color=bar_color, edgecolor='black')\n",
"    ax.set_title(panel_title)\n",
"    ax.set_xlabel('Length')\n",
"    ax.set_ylabel('Frequency')\n",
"    ax.set_xticks(x_ticks)\n",
"\n",
"# Render the figure.\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "d9a20ed6",
"metadata": {
"_cell_guid": "93840f23-72fc-4a14-afab-a7d0bb3cbdec",
"_uuid": "e1d2150a-bbdd-4a97-987f-3c5c348cf946",
"collapsed": false,
"id": "JnYOEPnJSK1I",
"jupyter": {
"outputs_hidden": false
},
"papermill": {
"duration": 0.02095,
"end_time": "2024-11-11T01:08:00.581884",
"exception": false,
"start_time": "2024-11-11T01:08:00.560934",
"status": "completed"
},
"tags": []
},
"source": [
"As we can see, there are both very long and very short sentences.\n",
"\n",
"Because we will use post-padding (right-padding), short sentences will suffer from vanishing gradients, so we will drop them."
]
},
{
"cell_type": "markdown",
"id": "016a024c",
"metadata": {
"_cell_guid": "5401299b-ecd8-4433-8818-2dde942b8f1c",
"_uuid": "7993eb62-7c1c-4400-a2cc-488985cda970",
"collapsed": false,
"id": "yk9hgqksU9_W",
"jupyter": {
"outputs_hidden": false
},
"papermill": {
"duration": 0.020897,
"end_time": "2024-11-11T01:08:00.623676",
"exception": false,
"start_time": "2024-11-11T01:08:00.602779",
"status": "completed"
},
"tags": []
},
"source": [
"### Cleaning"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "32c51041",
"metadata": {
"_cell_guid": "de16e1a4-e20e-4ec4-b24f-b05db3f151dd",
"_uuid": "c68e2832-2b45-4638-96d4-9bbd6c10231c",
"collapsed": false,
"execution": {
"iopub.execute_input": "2024-11-11T01:08:00.667588Z",
"iopub.status.busy": "2024-11-11T01:08:00.666834Z",
"iopub.status.idle": "2024-11-11T01:08:00.724816Z",
"shell.execute_reply": "2024-11-11T01:08:00.723725Z"
},
"executionInfo": {
"elapsed": 9,
"status": "ok",
"timestamp": 1731233863229,
"user": {
"displayName": "Abdelrhman Ashraf",
"userId": "11249532378747886614"
},
"user_tz": -120
},
"id": "D7-HJg_PVBWV",
"jupyter": {
"outputs_hidden": false
},
"outputId": "882a3d1e-ea75-4afb-a6ff-e976d037e084",
"papermill": {
"duration": 0.08227,
"end_time": "2024-11-11T01:08:00.726872",
"exception": false,
"start_time": "2024-11-11T01:08:00.644602",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 101409 entries, 0 to 101408\n",
"Data columns (total 4 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 EN_sentence 101409 non-null object\n",
" 1 AR_sentence 101409 non-null object\n",
" 2 EN_sentence_length 101409 non-null int64 \n",
" 3 AR_sentence_length 101409 non-null int64 \n",
"dtypes: int64(2), object(2)\n",
"memory usage: 3.1+ MB\n",
"None\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" EN_sentence_length | \n",
" AR_sentence_length | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 101409.000000 | \n",
" 101409.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 8.823053 | \n",
" 7.350225 | \n",
"
\n",
" \n",
" std | \n",
" 11.487242 | \n",
" 10.119463 | \n",
"
\n",
" \n",
" min | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 4.000000 | \n",
" 3.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 6.000000 | \n",
" 5.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 8.000000 | \n",
" 7.000000 | \n",
"
\n",
" \n",
" max | \n",
" 225.000000 | \n",
" 225.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" EN_sentence_length AR_sentence_length\n",
"count 101409.000000 101409.000000\n",
"mean 8.823053 7.350225\n",
"std 11.487242 10.119463\n",
"min 1.000000 1.000000\n",
"25% 4.000000 3.000000\n",
"50% 6.000000 5.000000\n",
"75% 8.000000 7.000000\n",
"max 225.000000 225.000000"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().\n",
"df_data.info()\n",
"df_data.describe()\n",
"## Q3 (75th percentile) of the EN sentence length is 8"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "dbf84937",
"metadata": {
"execution": {
"iopub.execute_input": "2024-11-11T01:08:00.771935Z",
"iopub.status.busy": "2024-11-11T01:08:00.771264Z",
"iopub.status.idle": "2024-11-11T01:08:00.779175Z",
"shell.execute_reply": "2024-11-11T01:08:00.778295Z"
},
"papermill": {
"duration": 0.032529,
"end_time": "2024-11-11T01:08:00.781173",
"exception": false,
"start_time": "2024-11-11T01:08:00.748644",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# https://stackoverflow.com/a/518232/2809427\n",
"def unicodeToAscii(s):\n",
" return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')\n",
"\n",
"def preprocess_ar(text):\n",
" text = araby.strip_diacritics(text).strip() # Remove diacritics \"التشكيل\"\n",
" text = re.sub(r'[a-zA-Z]', '', text) # Remove English letters\n",
" text = re.sub(r'\\s+', ' ', text).strip() # Trim multiple whitespaces to one\n",
" text = re.sub(r'[_|\\d+|\\\\|\\-|؛|،|,|\\[|\\]|\\(|\\)|\\\"|/|%|!|,|.|:|♪|«|»|}|{|*|#]+', '', text) # Remove special characters and digits\n",
" text = unicodeToAscii(text)\n",
" return text\n",
"\n",
"def preprocess_en(text):\n",
" text = text.lower()\n",
" text = contractions.fix(text) # Fix contractions \"it's\" -> \"it is\"\n",
" text = re.sub(r'[\\u0600-\\u06FF]', '', text) # Remove Arabic letters\n",
" text = re.sub(r'\\s+', ' ', text).strip() # Trim multiple whitespaces to one\n",
" text = re.sub(r'[_|\\d+|\\\\|\\-|؛|،|,|\\[|\\]|\\(|\\)|\\\"|/|%|!|,|.|:|♪|«|»|}|{|*|#]+', '', text) # Remove special characters and digits\n",
" text = unicodeToAscii(text)\n",
" return text"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "65970704",
"metadata": {
"execution": {
"iopub.execute_input": "2024-11-11T01:08:00.826388Z",
"iopub.status.busy": "2024-11-11T01:08:00.825773Z",
"iopub.status.idle": "2024-11-11T01:08:07.840714Z",
"shell.execute_reply": "2024-11-11T01:08:07.839840Z"
},
"papermill": {
"duration": 7.040131,
"end_time": "2024-11-11T01:08:07.842951",
"exception": false,
"start_time": "2024-11-11T01:08:00.802820",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"df_data['EN_sentence'] = df_data['EN_sentence'].apply(preprocess_en)\n",
"df_data['AR_sentence'] = df_data['AR_sentence'].apply(preprocess_ar)\n",
"\n",
"# The length columns were computed on the raw text. Cleaning changes the word counts\n",
"# (contractions expanded, digits and punctuation removed), so refresh them here before\n",
"# they are used to filter sentences. str.split() with no argument ignores repeated spaces.\n",
"df_data['EN_sentence_length'] = df_data['EN_sentence'].str.split().str.len()\n",
"df_data['AR_sentence_length'] = df_data['AR_sentence'].str.split().str.len()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "975eb354",
"metadata": {
"_cell_guid": "504b87d4-4ae2-4546-b20d-bd8a5a11a003",
"_uuid": "048c8e25-6fc4-4724-ba6d-0ec33961d904",
"collapsed": false,
"execution": {
"iopub.execute_input": "2024-11-11T01:08:07.888010Z",
"iopub.status.busy": "2024-11-11T01:08:07.887209Z",
"iopub.status.idle": "2024-11-11T01:08:07.896071Z",
"shell.execute_reply": "2024-11-11T01:08:07.895194Z"
},
"executionInfo": {
"elapsed": 8,
"status": "ok",
"timestamp": 1731233863230,
"user": {
"displayName": "Abdelrhman Ashraf",
"userId": "11249532378747886614"
},
"user_tz": -120
},
"id": "E8ApzqfrVHw9",
"jupyter": {
"outputs_hidden": false
},
"outputId": "9e86fcd0-9677-4d6f-ef58-5d347f82f4be",
"papermill": {
"duration": 0.033156,
"end_time": "2024-11-11T01:08:07.897900",
"exception": false,
"start_time": "2024-11-11T01:08:07.864744",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"(101409, 7383, 3334)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Total rows, rows with more than 20 EN words, rows with fewer than 3 EN words.\n",
"arr_len = df_data['EN_sentence_length']\n",
"too_long = arr_len > 20\n",
"too_short = arr_len < 3\n",
"len(arr_len), int(too_long.sum()), int(too_short.sum())"
]
},
{
"cell_type": "markdown",
"id": "141b4830",
"metadata": {
"_cell_guid": "dbce33c5-d4b9-4569-bc38-057f5714efdc",
"_uuid": "19ce86d4-32d0-43ba-bac0-d5cc75013a9b",
"collapsed": false,
"id": "qHIb_lTkXmd_",
"jupyter": {
"outputs_hidden": false
},
"papermill": {
"duration": 0.022359,
"end_time": "2024-11-11T01:08:07.941762",
"exception": false,
"start_time": "2024-11-11T01:08:07.919403",
"status": "completed"
},
"tags": []
},
"source": [
"We will drop sentence pairs in which either the English or the Arabic sentence has more than 20 words or fewer than 3 words."
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "77fa809d",
"metadata": {
"_cell_guid": "6b37401f-f687-4f7c-aca0-c9acd2bf4664",
"_uuid": "4ac3524d-109b-4e79-a6b2-8f63d3643f9f",
"collapsed": false,
"execution": {
"iopub.execute_input": "2024-11-11T01:08:07.987137Z",
"iopub.status.busy": "2024-11-11T01:08:07.986287Z",
"iopub.status.idle": "2024-11-11T01:08:08.112139Z",
"shell.execute_reply": "2024-11-11T01:08:08.111095Z"
},
"executionInfo": {
"elapsed": 6,
"status": "ok",
"timestamp": 1731233863230,
"user": {
"displayName": "Abdelrhman Ashraf",
"userId": "11249532378747886614"
},
"user_tz": -120
},
"id": "1XTCKT3nXxOD",
"jupyter": {
"outputs_hidden": false
},
"papermill": {
"duration": 0.150953,
"end_time": "2024-11-11T01:08:08.114259",
"exception": false,
"start_time": "2024-11-11T01:08:07.963306",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"EN_sentence 0\n",
"AR_sentence 0\n",
"EN_sentence_length 0\n",
"AR_sentence_length 0\n",
"dtype: int64"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only pairs whose EN and AR length columns both lie in [3, 20].\n",
"en_len_ok = df_data['EN_sentence_length'].between(3, 20)\n",
"ar_len_ok = df_data['AR_sentence_length'].between(3, 20)\n",
"df_data = df_data[en_len_ok & ar_len_ok]\n",
"\n",
"# Drop repeated sentences on either side, keeping the first occurrence.\n",
"for text_col in ('AR_sentence', 'EN_sentence'):\n",
"    df_data = df_data.drop_duplicates(keep='first', subset=text_col)\n",
"\n",
"# Treat empty and single-space cells as missing, then drop those rows.\n",
"df_data = df_data.replace('', pd.NA).replace(' ', pd.NA).dropna()\n",
"df_data.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "8bcf0f96",
"metadata": {
"_cell_guid": "e6d0a9ee-0f33-4fd3-9d10-3b26815ec3fe",
"_uuid": "06c813f4-6881-4566-a2df-485bdfac9520",
"collapsed": false,
"execution": {
"iopub.execute_input": "2024-11-11T01:08:08.159726Z",
"iopub.status.busy": "2024-11-11T01:08:08.159108Z",
"iopub.status.idle": "2024-11-11T01:08:08.841666Z",
"shell.execute_reply": "2024-11-11T01:08:08.840716Z"
},
"executionInfo": {
"elapsed": 759,
"status": "ok",
"timestamp": 1731233863984,
"user": {
"displayName": "Abdelrhman Ashraf",
"userId": "11249532378747886614"
},
"user_tz": -120
},
"id": "JYQ1JaoEfw9F",
"jupyter": {
"outputs_hidden": false
},
"outputId": "4c7856f2-a45b-4a0b-b5a2-ccdbc377cb25",
"papermill": {
"duration": 0.707615,
"end_time": "2024-11-11T01:08:08.844017",
"exception": false,
"start_time": "2024-11-11T01:08:08.136402",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"