{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# StringSplitter\n",
    "\n",
    "The goal of this presentation is to introduce the features of the `StringSplitter` and to show how to configure it."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### The challenges\n",
    "\n",
    "- I want to split strings of varying length contained in a source field\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Given a preprocessed log entry:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "document = {\n",
    "    \"ip_addresses\": \"192.168.5.1, 10.10.2.1, fe80::, 127.0.0.1\"\n",
    "}\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create rules and processor"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create the rules:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "# Extend the module search path so that the `logprep` package can be imported\n",
    "# from a source checkout.\n",
    "# NOTE(review): assumes the repository root is five directories above the\n",
    "# notebook's working directory - adjust the path if the notebook is moved.\n",
    "sys.path.append(\"../../../../../\")\n",
    "\n",
    "from logprep.processor.string_splitter.rule import StringSplitterRule\n",
    "\n",
    "rules_definitions = [\n",
    "    {\n",
    "        \"filter\": \"ip_addresses\",\n",
    "        \"string_splitter\": {\n",
    "            \"source_fields\": [\"ip_addresses\"],\n",
    "            \"target_field\": \"ip_address_list\"\n",
    "        },\n",
    "    }\n",
    "]\n",
    "rules = [StringSplitterRule.create_from_dict(rule_dict) for rule_dict in rules_definitions]\n",
    "rules"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create the processor config:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "processor_config = {\n",
    "    \"allmighty_string_splitter\": {\n",
    "        \"type\": \"string_splitter\",\n",
    "        \"rules\": [\"/dev\"],\n",
    "    }\n",
    "}\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create the processor with the factory:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from logprep.factory import Factory\n",
    "\n",
    "processor = Factory.create(processor_config)\n",
    "processor\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load the rules into the processor:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for rule in rules:\n",
    "    processor._rule_tree.add_rule(rule)\n",
    "\n",
    "processor.rules"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Process event"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from copy import deepcopy\n",
    "\n",
    "# Process a deep copy so that the original `document` stays available for comparison.\n",
    "mydocument = deepcopy(document)\n",
    "processor.process(mydocument)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check Results"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The original document (only the deep copy was passed to the processor):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "document"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The processed copy:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mydocument"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.11.0 ('.venv': venv)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "586280540a85d3e21edc698fe7b86af2848b9b02644e6c22463da25c40a3f1be"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}