/**
 * Ordered steps of the monolingual data pipeline.
 *
 * Each entry carries:
 * - `number`: zero-padded step label ("01" to "04").
 * - `title`: short step heading.
 * - `text`: step description. It embeds inline HTML anchors, so it is
 *   presumably injected as markup by the consumer — verify against caller.
 * - `type`: always "mono" for this list.
 * - `key`: unique numeric id (1 to 4); presumably used as a list-render
 *   key — verify against caller.
 */
const MONOLINGUAL = [
  {
    number: "01",
    title: "data crawling",
    text: "Data gets <a class='hyperlink' target='_blank' href='https://github.com/macocu/MaCoCu-crawler'>collected</a> in the targeted languages from top level domains and related websites. Large quantities of raw data are gathered and stored in a <a class='hyperlink' target='_blank' href='https://www.sketchengine.eu/my_keywords/prevertical/'>prevertical</a> format.",
    type: "mono",
    key: 1,
  },
  {
    number: "02",
    // Spelling fixed: was "preproccessing".
    title: "preprocessing",
    text: "Relevant data is extracted from <a class='hyperlink' target='_blank' href='https://www.sketchengine.eu/my_keywords/prevertical/'>prevertical</a> files following a standardization process: plain text extraction, <a class='hyperlink' target='_blank' href='https://corpus.tools/wiki/Justext'>boilerplate</a>, <a class='hyperlink' target='_blank' href='https://corpus.tools/wiki/Onion'>near-duplicated paragraphs</a> and very short texts removal, language identification, encoding normalization, document and paragraph identification. Duplicate documents are deleted. Manual checks over the largest domains are applied to spot and delete very bad domains (e.g. machine translated).",
    type: "mono",
    key: 2,
  },
  {
    number: "03",
    title: "Scoring for filtering",
    text: "Monocleaner is then applied to score texts for fluency (score between 0 and 1) using a language model, double check language, and to further remove duplicates at the desired level (document, paragraph, or sentences).",
    type: "mono",
    key: 3,
  },
  {
    number: "04",
    title: "Enrichment and formatting",
    text: "Data is enriched with further info such as <a class='hyperlink' target='_blank' href='https://github.com/macocu/American-British-variety-classifier'>language variety</a>, <a class='hyperlink' target='_blank' href='https://github.com/bitextor/biroamer'>personal/sensitive information</a>, domain (<a class='hyperlink' target='_blank' href='https://github.com/macocu/DSI'>DSIs</a>), paragraph quality (labels, such as “short” or “good”, assigned based on paragraph length, URL and stopword density via the jusText tool). Data is converted to XML (and sometimes TSV) formats.",
    type: "mono",
    key: 4,
  },
];

/**
 * Ordered steps of the parallel (bilingual) data pipeline.
 *
 * Each entry carries:
 * - `number`: zero-padded step label ("01" to "06").
 * - `title`: short step heading.
 * - `text`: step description. Some entries embed inline HTML anchors, so it
 *   is presumably injected as markup by the consumer — verify against caller.
 * - `type`: always "parallel" for this list.
 * - `key`: unique numeric id (5 to 10); presumably used as a list-render
 *   key — verify against caller.
 */
const PARALLEL = [
  {
    number: "01",
    title: "data crawling",
    text: " Data gets <a class='hyperlink' target='_blank' href='https://github.com/macocu/MaCoCu-crawler'>collected</a> in the targeted languages from top level domains and related websites. Large quantities of raw data are gathered and stored in a <a class='hyperlink'  target='_blank' href='https://www.sketchengine.eu/my_keywords/prevertical/'>prevertical</a> format.",
    // Previously missing; every other entry in this list declares its type.
    type: "parallel",
    key: 5,
  },
  {
    number: "02",
    title: "Pre-processing",
    // Spelling fixed in the text below: "spliting" -> "splitting".
    text: "Relevant data is extracted from <a class='hyperlink' target='_blank' href='https://www.sketchengine.eu/my_keywords/prevertical/'>prevertical</a> files following a standardization process: plain text extraction, <a class='hyperlink' target='_blank' href='https://corpus.tools/wiki/Justext'>boilerplate</a>, <a class='hyperlink' target='_blank' href='https://corpus.tools/wiki/Onion'>near-duplicated paragraphs</a> and very short texts removal, language identification, encoding normalization, document and paragraph identification and sentence splitting. Duplicate documents are deleted.",
    type: "parallel",
    key: 6,
  },
  {
    number: "03",
    title: "document alignment",
    text: "First step for the identification of parallel data.  TF/IDF + machine translation is used to identify parallel documents.",
    type: "parallel",
    key: 7,
  },
  {
    number: "04",
    title: "sentence alignment",
    // Tool name fixed in the text below: "Bleualing" -> "Bleualign".
    text: "The Bleualign technique, using neural machine translation, is applied to identify parallel sentences.",
    type: "parallel",
    key: 8,
  },
  {
    number: "05",
    title: "fixing, filtering and scoring",
    text: "Noisy bilingual sentences are automatically fixed or filtered. Repeated segments are deduplicated.",
    type: "parallel",
    key: 9,
  },
  {
    number: "06",
    title: "enrichment and formatting",
    text: "Data is enriched with further info such as <a class='hyperlink' target='_blank' href='https://github.com/macocu/American-British-variety-classifier'>language variety</a>, <a class='hyperlink' target='_blank' href='https://github.com/bitextor/biroamer'>personal/sensitive information</a>, domain (<a class='hyperlink' target='_blank' href='https://github.com/macocu/DSI'>DSIs</a>), translationese or human/machine translation identification. Other metadata might be included. Data is converted to TXT and TMX formats.",
    type: "parallel",
    key: 10,
  },
];

export { MONOLINGUAL, PARALLEL };
