Custom Processing Units

The Snips NLU library provides a default NLU pipeline containing built-in processing units such as the DeterministicIntentParser or the ProbabilisticIntentParser.

However, it is possible to define custom processing units and use them in a SnipsNLUEngine.

The main processing unit of the Snips NLU processing pipeline is the SnipsNLUEngine. This engine relies on a list of IntentParser that are called successively until one of them manages to extract an intent. By default, two parsers are used by the engine: a DeterministicIntentParser and a ProbabilisticIntentParser.

Let’s focus on the probabilistic intent parser. This parser parses text using two steps: first it classifies the intent using an IntentClassifier and once the intent is known, it using a SlotFiller in order to extract the slots.

For the purpose of this tutorial, let’s build a custom alternative to the CRFSlotFiller which is the default slot filler used by the probabilistic intent parser.

Our custom slot filler will extract slots by relying on a very simple and naive keyword matching logic:

import json

from snips_nlu.common.utils import json_string
from snips_nlu.preprocessing import tokenize
from snips_nlu.result import unresolved_slot
from snips_nlu.slot_filler import SlotFiller


@SlotFiller.register("keyword_slot_filler")
class KeywordSlotFiller(SlotFiller):
    def __init__(self, config=None, **shared):
        super(KeywordSlotFiller, self).__init__(config, **shared)
        self.slots_keywords = None
        self.language = None

    @property
    def fitted(self):
        return self.slots_keywords is not None

    def fit(self, dataset, intent):
        self.language = dataset["language"]
        self.slots_keywords = dict()
        utterances = dataset["intents"][intent]["utterances"]
        for utterance in utterances:
            for chunk in utterance["data"]:
                if "slot_name" in chunk:
                    text = chunk["text"]
                    self.slots_keywords[text] = [
                        chunk["entity"],
                        chunk["slot_name"]
                    ]
        return self

    def get_slots(self, text):
        tokens = tokenize(text, self.language)
        slots = []
        for token in tokens:
            value = token.value
            if value in self.slots_keywords:
                entity = self.slots_keywords[value][0]
                slot_name = self.slots_keywords[value][1]
                slot = unresolved_slot((token.start, token.end), value,
                                       entity, slot_name)
                slots.append(slot)
        return slots

    def persist(self, path):
        model = {
            "language": self.language,
            "slots_keywords": self.slots_keywords,
            "config": self.config.to_dict()
        }
        with path.open(mode="w") as f:
            f.write(json_string(model))

    @classmethod
    def from_path(cls, path, **shared):
        with path.open() as f:
            model = json.load(f)
        slot_filler = cls()
        slot_filler.language = model["language"]
        slot_filler.slots_keywords = model["slots_keywords"]
        slot_filler.config = cls.config_type.from_dict(model["config"])
        return slot_filler

Our custom slot filler is registered to the list of available processing units by the use of a class decorator: @SlotFiller.register("keyword_slot_filler").

Now that we have created our keyword slot filler, we can create a specific NLUEngineConfig which will make use of it:

from snips_nlu import SnipsNLUEngine
from snips_nlu.pipeline.configs import (
    ProbabilisticIntentParserConfig, NLUEngineConfig)
from snips_nlu.slot_filler.keyword_slot_filler import KeywordSlotFiller

slot_filler_config = KeywordSlotFiller.default_config()
parser_config = ProbabilisticIntentParserConfig(
    slot_filler_config=slot_filler_config)
engine_config = NLUEngineConfig([parser_config])
nlu_engine = SnipsNLUEngine(engine_config)

Custom processing unit configuration

So far, our keyword slot filler is very simple, especially because it is not configurable.

Now, let’s imagine that we would like to perform a normalization step before matching keywords, which would consist in lowercasing the values. We could hardcode this behavior in our unit, but what we rather want is a way to configure this behavior. This can be done through the use of the config attribute of our keyword slot filler. Let’s add a boolean parameter in the config, so that now our KeywordSlotFiller implementation looks like this:

import json

from snips_nlu.common.utils import json_string
from snips_nlu.preprocessing import tokenize
from snips_nlu.result import unresolved_slot
from snips_nlu.slot_filler import SlotFiller


@SlotFiller.register("keyword_slot_filler")
class KeywordSlotFiller(SlotFiller):
    def __init__(self, config=None, **shared):
        super(KeywordSlotFiller, self).__init__(config, **shared)
        self.slots_keywords = None
        self.language = None

    @property
    def fitted(self):
        return self.slots_keywords is not None

    def fit(self, dataset, intent):
        self.language = dataset["language"]
        self.slots_keywords = dict()
        utterances = dataset["intents"][intent]["utterances"]
        for utterance in utterances:
            for chunk in utterance["data"]:
                if "slot_name" in chunk:
                    text = chunk["text"]
                    if self.config.get("lowercase", False):
                        text = text.lower()
                    self.slots_keywords[text] = [
                        chunk["entity"],
                        chunk["slot_name"]
                    ]
        return self

    def get_slots(self, text):
        tokens = tokenize(text, self.language)
        slots = []
        for token in tokens:
            normalized_value = token.value
            if self.config.get("lowercase", False):
                normalized_value = normalized_value.lower()
            if normalized_value in self.slots_keywords:
                entity = self.slots_keywords[normalized_value][0]
                slot_name = self.slots_keywords[normalized_value][1]
                slot = unresolved_slot((token.start, token.end), token.value,
                                       entity, slot_name)
                slots.append(slot)
        return slots

    def persist(self, path):
        model = {
            "language": self.language,
            "slots_keywords": self.slots_keywords,
            "config": self.config.to_dict()
        }
        with path.open(mode="w") as f:
            f.write(json_string(model))

    @classmethod
    def from_path(cls, path, **shared):
        with path.open() as f:
            model = json.load(f)
        slot_filler = cls()
        slot_filler.language = model["language"]
        slot_filler.slots_keywords = model["slots_keywords"]
        slot_filler.config = cls.config_type.from_dict(model["config"])
        return slot_filler

With this updated implementation, we can now define a more specific configuration for our slot filler:

from snips_nlu import SnipsNLUEngine
from snips_nlu.pipeline.configs import (
    ProbabilisticIntentParserConfig, NLUEngineConfig)
from snips_nlu.slot_filler.keyword_slot_filler import KeywordSlotFiller

slot_filler_config = {
    "unit_name": "keyword_slot_filler",  # required in order to identify the processing unit
    "lower_case": True
}
parser_config = ProbabilisticIntentParserConfig(
    slot_filler_config=slot_filler_config)
engine_config = NLUEngineConfig([parser_config])
nlu_engine = SnipsNLUEngine(engine_config)

You can now train this engine, parse intents, persist it and load it from disk.

Note

The client code is responsible for persisting and loading the unit configuration as done in the implementation example. This will ensure that the proper configuration is used when deserializing the processing unit.