IP anonymization

Hi folks,

I have Graylog running and analyzing logs from different input sources (beats & gelf).
Due to GDPR reasons I need to make sure IPs are anonymized.
Currently I use this Plugin https://github.com/graylog-labs/graylog-plugin-ipanonymizer, which replaces the 4th octet of IPv4 with xxx.
Unfortunately this project is no longer maintained and refers users to Pipelines instead.

Is it possible to filter all kinds of IPv4 and IPv6 addresses in a similar way to that plugin, by removing parts of the address before storing it in the MongoDB backend?

IPV6: ((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?

IPV4: (?<![0-9])(?:(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5]))(?![0-9])

I tried out the regex in the extractor section, but there I could only replace the whole IP address, not just a part of it.
The regex should also be applied multiple times to the message field, because IPs can appear at different positions in the message.

Is the usage of a Pipeline the right way to achieve this, and how exactly would I need to formulate a rule to do it?

Yes, a pipeline function is the ideal solution:

For IPv4, this pipeline rule replaces the last octet with xxx in the message field:

rule "anonymize IPv4"
when
  has_field("message")
then
  // Mask the last octet of dotted-quad IPv4 addresses in the message text.
  // Capture groups $1-$3 hold the first three octets and are kept as-is;
  // the fourth octet is replaced by the literal "xxx".
  // The look-behind/look-ahead keep the match from starting or ending
  // in the middle of a longer run of digits.
  let masked_message = regex_replace(
    value: to_string($message.message),
    pattern: "(?<![0-9])(?:([0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.]([0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.]([0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.]([0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5]))(?![0-9])",
    replacement: "$1.$2.$3.xxx"
  );
  set_field(field: "message", value: masked_message);
end

Hi @shoothub
thanks for the example.
It works the way I was expecting it to work!

The problem was that I had to upgrade my Graylog from 2.4.6 to 3.3.3 first, because the “regex_replace” function was added in Graylog 3.0.0 …
It was a bit annoying, because with this update a few config settings also changed and I needed to modify the start ENV parameters for my Docker container, but now it does what it should.

I will now try the same for IPv6. When I have a working Rule I will post it here.

Best regards!

Edit:

Graylog had some issues with the IPv6 patterns that I wanted to use, and because I already replace IPv4 addresses, I do not need the embedded-IPv4 part of the regex for IPv6.

rule "anonymize IPv6 v1"
when
  has_field("message")
then
  // Form handled: seven colon-terminated hex groups followed by either an
  // eighth hex group or a second ":" (i.e. a trailing "::").
  // Only the first two groups ($1, $2) are kept; the remainder is
  // replaced by "yyy" placeholders ending in "::".
  let masked_message = regex_replace(
    value: to_string($message.message),
    pattern: "([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}|:)",
    replacement: "$1:$2:yyy:yyy:yyy:yyy:yyy::"
  );
  set_field(field: "message", value: masked_message);
end

rule "anonymize IPv6 v2"
when
  has_field("message")
then
  // Form handled: six colon-terminated hex groups followed by a second ":"
  // (forming "::"), optionally with one more hex group after it.
  // Only the first two groups ($1, $2) are kept; the remainder is
  // replaced by "yyy" placeholders ending in "::".
  let masked_message = regex_replace(
    value: to_string($message.message),
    pattern: "([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):(:[0-9A-Fa-f]{1,4}|:)",
    replacement: "$1:$2:yyy:yyy:yyy:yyy::"
  );
  set_field(field: "message", value: masked_message);
end

rule "anonymize IPv6 v3"
when
  has_field("message")
then
  // Form handled: five colon-terminated hex groups, then "::" semantics —
  // either a bare second ":" or ":" plus one or two further hex groups.
  // Only the first two groups ($1, $2) are kept; the remainder is
  // replaced by "yyy" placeholders ending in "::".
  let masked_message = regex_replace(
    value: to_string($message.message),
    pattern: "([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):(?::([0-9A-Fa-f]{1,4})(?::([0-9A-Fa-f]{1,4}))?|:)",
    replacement: "$1:$2:yyy:yyy:yyy::"
  );
  set_field(field: "message", value: masked_message);
end

rule "anonymize IPv6 v4"
when
  has_field("message")
then
  // Form handled: four colon-terminated hex groups, then either a bare
  // second ":" or ":" plus one to three further hex groups.
  // Only the first two groups ($1, $2) are kept; the remainder is
  // replaced by "yyy" placeholders ending in "::".
  let masked_message = regex_replace(
    value: to_string($message.message),
    pattern: "([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):(?::([0-9A-Fa-f]{1,4})(?::([0-9A-Fa-f]{1,4}))?(?::([0-9A-Fa-f]{1,4}))?|:)",
    replacement: "$1:$2:yyy:yyy::"
  );
  set_field(field: "message", value: masked_message);
end

rule "anonymize IPv6 v5"
when
  has_field("message")
then
  // Form handled: three colon-terminated hex groups, then either a bare
  // second ":" or ":" plus one to four further hex groups.
  // Only the first group ($1) is kept; the remainder is replaced by
  // "yyy" placeholders ending in "::".
  let masked_message = regex_replace(
    value: to_string($message.message),
    pattern: "([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):(?::([0-9A-Fa-f]{1,4})(?::([0-9A-Fa-f]{1,4}))?(?::([0-9A-Fa-f]{1,4}))?(?::([0-9A-Fa-f]{1,4}))?|:)",
    replacement: "$1:yyy:yyy::"
  );
  set_field(field: "message", value: masked_message);
end

rule "anonymize IPv6 v6"
when
  has_field("message")
then
  // Form handled: two colon-terminated hex groups, then either a bare
  // second ":" or ":" plus one to five further hex groups.
  // Only the first group ($1) is kept; the remainder is replaced by
  // a "yyy" placeholder ending in "::".
  let masked_message = regex_replace(
    value: to_string($message.message),
    pattern: "([0-9A-Fa-f]{1,4}):([0-9A-Fa-f]{1,4}):(?::([0-9A-Fa-f]{1,4})(?::([0-9A-Fa-f]{1,4}))?(?::([0-9A-Fa-f]{1,4}))?(?::([0-9A-Fa-f]{1,4}))?(?::([0-9A-Fa-f]{1,4}))?|:)",
    replacement: "$1:yyy::"
  );
  set_field(field: "message", value: masked_message);
end

rule "anonymize IPv6 v7"
when
  has_field("message")
then
  // Form handled: a single colon-terminated hex group, then either a bare
  // second ":" or ":" plus one to six further hex groups.
  // No captured group is reused: the whole match becomes "yyy::".
  let masked_message = regex_replace(
    value: to_string($message.message),
    pattern: "([0-9A-Fa-f]{1,4}):(?::([0-9A-Fa-f]{1,4})(?::([0-9A-Fa-f]{1,4}))?(?::([0-9A-Fa-f]{1,4}))?(?::([0-9A-Fa-f]{1,4}))?(?::([0-9A-Fa-f]{1,4}))?(?::([0-9A-Fa-f]{1,4}))?|:)",
    replacement: "yyy::"
  );
  set_field(field: "message", value: masked_message);
end

rule "anonymize IPv6 v8"
when
  has_field("message")
then
  // Form handled: a leading "::" followed by one to seven hex groups.
  // No captured group is reused: the whole match becomes "::yyy".
  let masked_message = regex_replace(
    value: to_string($message.message),
    pattern: "::([0-9A-Fa-f]{1,4})(?::([0-9A-Fa-f]{1,4}))?(?::([0-9A-Fa-f]{1,4}))?(?::([0-9A-Fa-f]{1,4}))?(?::([0-9A-Fa-f]{1,4}))?(?::([0-9A-Fa-f]{1,4}))?(?::([0-9A-Fa-f]{1,4}))?",
    replacement: "::yyy"
  );
  set_field(field: "message", value: masked_message);
end

I put all 9 of these rules into a pipeline, which I applied to all streams. Now everything is anonymized.