Intro to Elasticsearch

GitHub Page

Intro

  • Download the Elasticsearch tar from https://www.elastic.co/start (the examples here use version 5.4.1)
  • Extract tar into directory
  • Start elasticsearch
./bin/elasticsearch
#or
./bin/elasticsearch -Ecluster.name=the_cluster -Enode.name=the_node

cURL

 curl -XGET 'localhost:9200/_cat/indices?v&pretty'
 health status index uuid pri rep docs.count docs.deleted store.size pri.store.size


 curl -XGET 'localhost:9200/_cat/nodes?v&pretty'
 ip        heap.percent ram.percent cpu load_1m load_5m load_15m node.role master name
127.0.0.1            6          99  43    1.72    1.69     2.46 mdi       *      the_node


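 # NOTE: the output pasted below is a copy of the _cat/nodes output; _cat/health actually reports cluster-level columns (epoch, timestamp, cluster, status, node.total, node.data, shards, ...)
 curl -XGET 'localhost:9200/_cat/health?v&pretty'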
ip        heap.percent ram.percent cpu load_1m load_5m load_15m node.role master name
127.0.0.1            6          99  43    1.72    1.69     2.46 mdi       *      the_node

Create a new index

curl -XPUT 'localhost:9200/products?&pretty'
# response
{
  "acknowledged" : true,
  "shards_acknowledged" : true
}

Check new index

# list indices
curl -XGET 'localhost:9200/_cat/indices?v&pretty'
health status index    uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   products nKzxKcEVSoC2OoKtL_HLaQ   5   1          0            0       650b           650b

PUT: create or replace a resource at an explicit ID; POST: create a resource with an auto-generated ID, or update an existing one (e.g. via the _update API)

Every index in Elasticsearch has 5 shards and 1 replica by default.

Create more new indices

curl -XPUT 'localhost:9200/customers?&pretty'
curl -XPUT 'localhost:9200/orders?&pretty'

Check indices

curl -XGET 'localhost:9200/_cat/indices?v&pretty'
health status index     uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   products  nKzxKcEVSoC2OoKtL_HLaQ   5   1          0            0       650b           650b
yellow open   customers D8OHORC3T-GHpyJDmCgEnA   5   1          0            0       650b           650b
yellow open   orders    JOxpIn6DRuur-MHDO4h6ZQ   5   1          0            0       650b           650b

Add documents to index

-d is for data

curl -XPUT 'localhost:9200/products/mobiles/1?pretty' -d'
{
"name" : "iPhone",
"camera" : "12MP",
"reviews" : ["Incredibly happy to use it", "I like it a lot"]
}'

# response
{
  "_index" : "products",
  "_type" : "mobiles",
  "_id" : "1",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 2,
    "successful" : 1,
    "failed" : 0
  },
  "created" : true
}

And create another one

curl -XPUT 'localhost:9200/products/mobiles/2?pretty' -d'
{
"name" : "Samsung Galaxy",
"camera" : "8MP",
"reviews" : ["A bit too big for me", "I will sell it soon"]
}'

# response
{
  "_index" : "products",
  "_type" : "mobiles",
  "_id" : "2",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 2,
    "successful" : 1,
    "failed" : 0
  },
  "created" : true
}

Create a laptop category within products
Autogenerate the ID!

curl -XPOST 'localhost:9200/products/laptops?pretty' -d'
{
"name" : "Macbook Pro",
"storage" : "500GB",
"reviews" : ["Incredibly happy to use it", "I like it a lot"]
}'

# response
{
  "_index" : "products",
  "_type" : "laptops",
  "_id" : "AWDDl4oPUqJGY70jJJ21",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 2,
    "successful" : 1,
    "failed" : 0
  },
  "created" : true
}

List indices


curl -XGET 'localhost:9200/_cat/indices?v&pretty'
health status index     uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   products  nKzxKcEVSoC2OoKtL_HLaQ   5   1          4            0     19.2kb         19.2kb
yellow open   customers D8OHORC3T-GHpyJDmCgEnA   5   1          0            0       650b           650b
yellow open   orders    JOxpIn6DRuur-MHDO4h6ZQ   5   1          0            0       650b           650b

Retrieve whole or partial documents

One document

Specify the index, the document type and the document ID

curl -XGET 'localhost:9200/products/mobiles/1?pretty'


{
  "_index" : "products",
  "_type" : "mobiles",
  "_id" : "1",
  "_version" : 1,
  "found" : true,
  "_source" : {
    "name" : "iPhone",
    "camera" : "12MP",
    "reviews" : [
      "Incredibly happy to use it",
      "I like it a lot"
    ]
  }
}

Don’t include source in the response

curl -XGET 'localhost:9200/products/mobiles/1?pretty&_source=false'
{
  "_index" : "products",
  "_type" : "mobiles",
  "_id" : "1",
  "_version" : 1,
  "found" : true
}

Retrieve only certain fields

curl -XGET 'localhost:9200/products/mobiles/1?pretty&_source=name,camera'
{
  "_index" : "products",
  "_type" : "mobiles",
  "_id" : "1",
  "_version" : 1,
  "found" : true,
  "_source" : {
    "name" : "iPhone",
    "camera" : "12MP"
  }
}

Updating whole document

Before

curl -XGET 'localhost:9200/products/mobiles/2?pretty'
{
  "_index" : "products",
  "_type" : "mobiles",
  "_id" : "2",
  "_version" : 1,
  "found" : true,
  "_source" : {
    "name" : "Samsung Galaxy",
    "camera" : "8MP",
    "reviews" : [
      "A bit too big for me",
      "I will sell it soon"
    ]
  }
}

Change document

curl -XPUT 'localhost:9200/products/mobiles/2?pretty' -d'
{
    "name" : "Samsung Big One",
    "camera" : "16MP",
    "reviews" : [
      "Love it",
      "I will sell it soon"
    ]
  }
'


# response
{
  "_index" : "products",
  "_type" : "mobiles",
  "_id" : "2",
  "_version" : 2,
  "result" : "updated",
  "_shards" : {
    "total" : 2,
    "successful" : 1,
    "failed" : 0
  },
  "created" : false
}

After

curl -XGET 'localhost:9200/products/mobiles/2?pretty'
{
  "_index" : "products",
  "_type" : "mobiles",
  "_id" : "2",
  "_version" : 2,
  "found" : true,
  "_source" : {
    "name" : "Samsung Big One",
    "camera" : "16MP",
    "reviews" : [
      "Love it",
      "I will sell it soon"
    ]
  }
}

Updating partial document

_update API, uses the POST command with a “doc” field

curl -XPOST 'localhost:9200/products/mobiles/2/_update?pretty' -d'
{
"doc" : {"color": "black"}
}
'

Check update

curl -XGET 'localhost:9200/products/mobiles/2?pretty'
{
  "_index" : "products",
  "_type" : "mobiles",
  "_id" : "2",
  "_version" : 3,
  "found" : true,
  "_source" : {
    "name" : "Samsung Big One",
    "camera" : "16MP",
    "reviews" : [
      "Love it",
      "I will sell it soon"
    ],
    "color" : "black"
  }
}

Update the number in the doc

curl -XPOST 'localhost:9200/products/budget/1/_update?pretty' -d'
{
"script" :  "ctx._source.money +=2"
}
'

Deleting Document

curl -XDELETE 'localhost:9200/products/mobiles/2?pretty'

# response
{
  "found" : true,
  "_index" : "products",
  "_type" : "mobiles",
  "_id" : "2",
  "_version" : 4,
  "result" : "deleted",
  "_shards" : {
    "total" : 2,
    "successful" : 1,
    "failed" : 0
  }
}

Check if document exists – check server response

curl -i -XHEAD 'localhost:9200/products/mobiles/2?pretty'

Delete Index

curl -XDELETE 'localhost:9200/orders?pretty'

#response
{
  "acknowledged" : true
}


Bulk operations – retrieve multiple records

curl -XGET 'localhost:9200/_mget?pretty' -d'
{
"docs" : [
{ "_index" : "products",
"_type" : "mobiles",
"_id" : "1"
},
{ "_index" : "products",
"_type" : "mobiles",
"_id" : "2"
}
]
}

'
{
  "docs" : [
    {
      "_index" : "products",
      "_type" : "mobiles",
      "_id" : "1",
      "_version" : 1,
      "found" : true,
      "_source" : {
        "name" : "iPhone",
        "camera" : "12MP",
        "reviews" : [
          "Incredibly happy to use it",
          "I like it a lot"
        ]
      }
    },
    {
      "_index" : "products",
      "_type" : "mobiles",
      "_id" : "2",
      "found" : false
    }
  ]
}

Shorter version

curl -XGET 'localhost:9200/products/mobiles/_mget?pretty' -d'
{
"docs" : [
{ 
"_id" : "1"
},
{ 
"_id" : "2"
}
]
}
'

# response
{
  "docs" : [
    {
      "_index" : "products",
      "_type" : "mobiles",
      "_id" : "1",
      "_version" : 1,
      "found" : true,
      "_source" : {
        "name" : "iPhone",
        "camera" : "12MP",
        "reviews" : [
          "Incredibly happy to use it",
          "I like it a lot"
        ]
      }
    },
    {
      "_index" : "products",
      "_type" : "mobiles",
      "_id" : "2",
      "found" : false
    }
  ]
}

Import documents from JSON

# json file
{"index": {}}
{"name": "XQ", "age" : "44"}
curl -H "Content-Type: application/x-ndjson" -XPOST 'localhost:9200/customers/personal/_bulk?pretty&refresh' --data-binary @"customers.json"

How search works

  • Knows the document exists – Web Crawler
  • Index the document for lookup – Inverted index
  • Know how relevant is the document – Scoring
  • Retrieve results ranked by relevance – Search

The Query DSL

https://www.elastic.co/guide/en/elasticsearch/guide/current/query-dsl-intro.html

The search language that Elasticsearch uses. You should use it in production to write your queries.

2 contexts
* How well does this document match the query? QUERY CONTEXT
* Does this document match this query clause? FILTER CONTEXT

Query context

  • is the document included in the result or not
  • Relevance score calculated for every search term the document maps to

High score, more relevant

Filter context

  • Is the document included in the result or not
  • No scoring of individual documents
  • Used on structured data for exact matches
  • Queries are faster and more performant as there is no scoring to consider

Generate fake JSON data

1000 customers
https://www.json-generator.com/

[
  {
    'repeat(100, 100)': {

      name: '{{firstName()}} {{surname()}}',
      age: '{{integer(18,55)}}',
      phone: '+1 {{phone()}}',
      city: '{{city()}}'

    }
  }
]

Prepare JSON file for import

  • remove beginning [ and end ]
  • regex replace },{ with }\n{
  • add index field before every item
Find 
{"name"
Replace with: 
{"index":{}} \n{"name"

Import data

Delete customers index

curl -XDELETE 'localhost:9200/customers?pretty'

Import data

curl -H "Content-Type: application/x-ndjson" -XPOST 'localhost:9200/customers/personal/_bulk?pretty&refresh' --data-binary @"customers.json"

Check


curl -XGET 'localhost:9200/_cat/indices?v&pretty'

Search using request body – get all documents

  • relevance score is not calculated
  • stateless searches
curl -XGET 'localhost:9200/products/_search?pretty' -d'
{
"query" : {"match_all" : {} }
}
'
curl -XGET 'localhost:9200/customers/_search?pretty' -d'
{
"query" : {"match_all" : {} }
}
'

# response 
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 999,
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "customers",
        "_type" : "personal",
        "_id" : "AWDD7B-vUqJGY70jJJ3C",
        "_score" : 1.0,
        "_source" : {
          "name" : "Imelda Mcdaniel",
          "age" : 68,
          "gender" : "female",
          "email" : "imeldamcdaniel@talkalot.com",
          "phone" : "+1 (856) 480-3825",

change the number of results

curl -XGET 'localhost:9200/customers/_search?pretty' -d'
{
"query" : {"match_all" : {} },
"size" : 3
}
'

Sort order

curl -XGET 'localhost:9200/customers/_search?pretty' -d'
{
"query" : {"match_all" : {} },
"sort" : {"age" : {"order": "desc"}},
"size" : 5
}
'

Source filtering

    • relevance score was calculated for each match
curl -XGET 'localhost:9200/customers/_search?pretty' -d'
{
"query" : {"term" : { "name" : "gates" } }
}
'

# response
{
  "took" : 3,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 2,
    "max_score" : 4.429491,
    "hits" : [
...

Filter results but don’t show sources

curl -XGET 'localhost:9200/customers/_search?pretty' -d'
{
"_source" : false,
"query" : {"term" : { "name" : "gates" } }
}
'

# response 
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 2,
    "max_score" : 4.429491,
    "hits" : [
      {
        "_index" : "customers",
        "_type" : "personal",
        "_id" : "AWDD7B-xUqJGY70jJJ_7",
        "_score" : 4.429491
      },
      {
        "_index" : "customers",
        "_type" : "personal",
        "_id" : "AWDD7B-wUqJGY70jJJ5z",
        "_score" : 4.3718715
      }
    ]
  }
}


Retrieve only defined fields

# "_source" accepts a single pattern ("st*") or a list of patterns (["st*", "*n*"])
curl -XGET 'localhost:9200/customers/_search?pretty' -d'
{
"_source" : "st*",
"query" : {"term" : { "name" : "gates" } }
}
'


# response 
{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 2,
    "max_score" : 4.429491,
    "hits" : [
      {
        "_index" : "customers",
        "_type" : "personal",
        "_id" : "AWDD7B-xUqJGY70jJJ_7",
        "_score" : 4.429491,
        "_source" : {
          "street" : "934 Strong Place",
          "state" : "Nevada, 7571"
        }
      },
      {
        "_index" : "customers",
        "_type" : "personal",
        "_id" : "AWDD7B-wUqJGY70jJJ5z",
        "_score" : 4.3718715,
        "_source" : {
          "street" : "194 Columbia Place",
          "state" : "Ohio, 3096"
        }
      }
    ]
  }
}

Includes / Excludes

curl -XGET 'localhost:9200/customers/_search?pretty' -d'
{
"_source" :  {
            "includes" :  ["st*", "*n*"],
            "excludes" : ["*der"]
            },
"query" : {"term" : { "name" : "gates" } }
}
'

Full text fields

Queries using:
* match
* match_phrase
* match_phrase_prefix

curl -XGET 'localhost:9200/customers/_search?pretty' -d'
{
"query" : {
            "match" : 
            { "name" : "webster" } }
}
'

curl -XGET 'localhost:9200/customers/_search?pretty' -d'
{
"query" : {
            "match" : {
                    "name" :    {
                                "query" : "frank norris",
                                "operator" : "or"
                    }
        }
    }
}
'

SOURCE:
Pluralsight – Searching and Analyzing Data with Elasticsearch: Getting Started