Location based document tagging

Location based document tagging

The goal is to find documents that mention locations, and especially vocabulary related to geolocation.

The discussion was about a fast way to look up location terminology by storing those keywords in Bloom filters.

Bloom filter: use poppy (GitHub - hashlookup/poppy: Rust implementation of the DCSO Bloom filter · GitHub)
python: poppy-py

1 Like

Work in progress repository:

curl -sG   --data-urlencode "topic=luxembourg"   http://127.0.0.1:5000/api/query | jq 
{
  "results": {
    "combined": true,
    "country/ar": false,
    "country/bn": false,
    "country/de": false,
    "country/en": true,
    "country/es": false,
    "country/fa": false,
    "country/fr": true,
    "country/hi": false,
    "country/id": false,
    "country/it": false,
    "country/ja": false,
    "country/ko": false,
    "country/mr": false,
    "country/pt": false,
    "country/ru": false,
    "country/sw": true,
    "country/ta": false,
    "country/te": false,
    "country/tr": false,
    "country/ur": false,
    "country/vi": true,
    "country/zh": false,
    "location/ar": false,
    "location/bn": false,
    "location/de": false,
    "location/en": false,
    "location/es": false,
    "location/fa": false,
    "location/fr": false,
    "location/hi": false,
    "location/id": false,
    "location/it": false,
    "location/ja": false,
    "location/ko": false,
    "location/mr": false,
    "location/pt": false,
    "location/ru": false,
    "location/sw": false,
    "location/ta": false,
    "location/te": false,
    "location/tr": false,
    "location/ur": false,
    "location/vi": false,
    "location/zh": false
  },
  "topic": "luxembourg"
}

And for a full-text query:

 curl -s -X POST   -H "Content-Type: application/json"   -d '{"text":"Paris is in France and Paris has cafes. Rue de la... Adress... pobox... addresse avenue bricolage rue rue rue rue belgique luxembourg","top_n":5}'   http://127.0.0.1:5000/api/query-text | jq
{
  "analyzed_filters": [
    "combined",
    "country/ar",
    "country/bn",
    "country/de",
    "country/en",
    "country/es",
    "country/fa",
    "country/fr",
    "country/hi",
    "country/id",
    "country/it",
    "country/ja",
    "country/ko",
    "country/mr",
    "country/pt",
    "country/ru",
    "country/sw",
    "country/ta",
    "country/te",
    "country/tr",
    "country/ur",
    "country/vi",
    "country/zh",
    "location/ar",
    "location/bn",
    "location/de",
    "location/en",
    "location/es",
    "location/fa",
    "location/fr",
    "location/hi",
    "location/id",
    "location/it",
    "location/ja",
    "location/ko",
    "location/mr",
    "location/pt",
    "location/ru",
    "location/sw",
    "location/ta",
    "location/te",
    "location/tr",
    "location/ur",
    "location/vi",
    "location/zh"
  ],
  "filter_counts": {
    "combined": 11,
    "country/ar": 0,
    "country/bn": 0,
    "country/de": 0,
    "country/en": 2,
    "country/es": 0,
    "country/fa": 0,
    "country/fr": 3,
    "country/hi": 0,
    "country/id": 0,
    "country/it": 0,
    "country/ja": 0,
    "country/ko": 0,
    "country/mr": 0,
    "country/pt": 0,
    "country/ru": 0,
    "country/sw": 1,
    "country/ta": 0,
    "country/te": 0,
    "country/tr": 0,
    "country/ur": 0,
    "country/vi": 1,
    "country/zh": 0,
    "location/ar": 0,
    "location/bn": 0,
    "location/de": 2,
    "location/en": 2,
    "location/es": 1,
    "location/fa": 0,
    "location/fr": 5,
    "location/hi": 1,
    "location/id": 0,
    "location/it": 1,
    "location/ja": 0,
    "location/ko": 0,
    "location/mr": 0,
    "location/pt": 1,
    "location/ru": 0,
    "location/sw": 0,
    "location/ta": 0,
    "location/te": 0,
    "location/tr": 0,
    "location/ur": 0,
    "location/vi": 0,
    "location/zh": 0
  },
  "filter_ratios": {
    "combined": 0.5,
    "country/ar": 0,
    "country/bn": 0,
    "country/de": 0,
    "country/en": 0.09090909090909091,
    "country/es": 0,
    "country/fa": 0,
    "country/fr": 0.13636363636363635,
    "country/hi": 0,
    "country/id": 0,
    "country/it": 0,
    "country/ja": 0,
    "country/ko": 0,
    "country/mr": 0,
    "country/pt": 0,
    "country/ru": 0,
    "country/sw": 0.045454545454545456,
    "country/ta": 0,
    "country/te": 0,
    "country/tr": 0,
    "country/ur": 0,
    "country/vi": 0.045454545454545456,
    "country/zh": 0,
    "location/ar": 0,
    "location/bn": 0,
    "location/de": 0.09090909090909091,
    "location/en": 0.09090909090909091,
    "location/es": 0.045454545454545456,
    "location/fa": 0,
    "location/fr": 0.22727272727272727,
    "location/hi": 0.045454545454545456,
    "location/id": 0,
    "location/it": 0.045454545454545456,
    "location/ja": 0,
    "location/ko": 0,
    "location/mr": 0,
    "location/pt": 0.045454545454545456,
    "location/ru": 0,
    "location/sw": 0,
    "location/ta": 0,
    "location/te": 0,
    "location/tr": 0,
    "location/ur": 0,
    "location/vi": 0,
    "location/zh": 0
  },
  "text": "Paris is in France and Paris has cafes. Rue de la... Adress... pobox... addresse avenue bricolage rue rue rue rue belgique luxembourg",
  "token_count": 22,
  "top_filters": [
    {
      "count": 11,
      "filter": "combined",
      "matching_words": [
        {
          "count": 5,
          "word": "rue"
        },
        {
          "count": 1,
          "word": "avenue"
        },
        {
          "count": 1,
          "word": "belgique"
        },
        {
          "count": 1,
          "word": "de"
        },
        {
          "count": 1,
          "word": "france"
        },
        {
          "count": 1,
          "word": "in"
        },
        {
          "count": 1,
          "word": "luxembourg"
        }
      ],
      "ratio": 0.5
    },
    {
      "count": 5,
      "filter": "location/fr",
      "matching_words": [
        {
          "count": 5,
          "word": "rue"
        }
      ],
      "ratio": 0.22727272727272727
    },
    {
      "count": 3,
      "filter": "country/fr",
      "matching_words": [
        {
          "count": 1,
          "word": "belgique"
        },
        {
          "count": 1,
          "word": "france"
        },
        {
          "count": 1,
          "word": "luxembourg"
        }
      ],
      "ratio": 0.13636363636363635
    },
    {
      "count": 2,
      "filter": "country/en",
      "matching_words": [
        {
          "count": 1,
          "word": "france"
        },
        {
          "count": 1,
          "word": "luxembourg"
        }
      ],
      "ratio": 0.09090909090909091
    },
    {
      "count": 2,
      "filter": "location/de",
      "matching_words": [
        {
          "count": 1,
          "word": "avenue"
        },
        {
          "count": 1,
          "word": "in"
        }
      ],
      "ratio": 0.09090909090909091
    }
  ],
  "unique_token_count": 17
}

Work in progress for a BERT-based location finder, including a multilingual mock location dataset:

https://github.com/ail-project/bertlocation

1 Like

Further Ideas on Location Finding:

1 Like

Some clean-up to improve the different filter lists.

Remove stop-words per language

../topic/country/ru.txt (removed 2 stop words)
../topic/country/tr.txt (removed 1 stop words)
../topic/geonames/af.txt (removed 12 stop words)
../topic/geonames/ar.txt (removed 0 stop words)
../topic/geonames/bg.txt (removed 5 stop words)
../topic/geonames/br.txt (removed 56 stop words)
../topic/geonames/ca.txt (removed 8 stop words)
../topic/geonames/de.txt (removed 126 stop words)
../topic/geonames/es.txt (removed 102 stop words)
../topic/geonames/et.txt (removed 6 stop words)
../topic/geonames/fi.txt (removed 19 stop words)
../topic/geonames/fr.txt (removed 291 stop words)
../topic/geonames/ga.txt (removed 3 stop words)
../topic/geonames/gl.txt (removed 1 stop words)
../topic/geonames/hr.txt (removed 5 stop words)
../topic/geonames/hu.txt (removed 13 stop words)
../topic/geonames/id.txt (removed 152 stop words)
../topic/geonames/it.txt (removed 164 stop words)
../topic/geonames/la.txt (removed 1 stop words)
../topic/geonames/lt.txt (removed 5 stop words)
../topic/geonames/lv.txt (removed 0 stop words)
../topic/geonames/mr.txt (removed 1 stop words)
../topic/geonames/nl.txt (removed 32 stop words)
../topic/geonames/no.txt (removed 36 stop words)
../topic/geonames/pl.txt (removed 11 stop words)
../topic/geonames/pt.txt (removed 37 stop words)
../topic/geonames/ro.txt (removed 28 stop words)
../topic/geonames/ru.txt (removed 115 stop words)
../topic/geonames/sk.txt (removed 15 stop words)
../topic/geonames/sl.txt (removed 11 stop words)
../topic/geonames/so.txt (removed 1 stop words)
../topic/geonames/st.txt (removed 1 stop words)
../topic/geonames/th.txt (removed 3 stop words)
../topic/geonames/tr.txt (removed 41 stop words)
../topic/location/ar.txt (removed 7 stop words)
../topic/location/bn.txt (removed 10 stop words)
../topic/location/de.txt (removed 21 stop words)
../topic/location/en.txt (removed 41 stop words)
../topic/location/es.txt (removed 25 stop words)
../topic/location/fa.txt (removed 19 stop words)
../topic/location/fr.txt (removed 26 stop words)
../topic/location/hi.txt (removed 10 stop words)
../topic/location/id.txt (removed 29 stop words)
../topic/location/it.txt (removed 25 stop words)
../topic/location/ja.txt (removed 4 stop words)
../topic/location/ko.txt (removed 5 stop words)
../topic/location/mr.txt (removed 1 stop words)
../topic/location/pt.txt (removed 29 stop words)
../topic/location/ru.txt (removed 33 stop words)
../topic/location/sw.txt (removed 10 stop words)
../topic/location/tr.txt (removed 6 stop words)
../topic/location/ur.txt (removed 4 stop words)
../topic/location/vi.txt (removed 18 stop words)
../topic/location/zh.txt (removed 8 stop words)

Remove numerical values