CHORUS/vendor/github.com/blevesearch/geo/s2/region_term_indexer.go

// Copyright 2021 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// Indexing Strategy
// -----------------
//
// Given a query region, we want to find all of the document regions that
// intersect it.  The first step is to represent all the regions as S2Cell
// coverings (see S2RegionCoverer).  We then split the problem into two parts,
// namely finding the document regions that are "smaller" than the query
// region and those that are "larger" than the query region.
//
// We do this by defining two terms for each S2CellId: a "covering term" and
// an "ancestor term".  (In the implementation below, covering terms are
// distinguished by prefixing a '$' to them.)  For each document region, we
// insert a covering term for every cell in the region's covering, and we
// insert an ancestor term for these cells *and* all of their ancestors.
//
// Then given a query region, we can look up all the document regions that
// intersect its covering by querying the union of the following terms:
//
// 1. An "ancestor term" for each cell in the query region.  These terms
//    ensure that we find all document regions that are "smaller" than the
//    query region, i.e. where the query region contains a cell that is either
//    a cell of a document region or one of its ancestors.
//
// 2. A "covering term" for every ancestor of the cells in the query region.
//    These terms ensure that we find all the document regions that are
//    "larger" than the query region, i.e. where the document region contains
//    a cell that is a (proper) ancestor of a cell in the query region.
//
// Together, these terms find all of the document regions that intersect the
// query region.  Furthermore, the number of terms to be indexed and queried
// are both fairly small, and can be bounded in terms of max_cells() and the
// number of cell levels used.
//
// Optimizations
// -------------
//
// + Cells at the maximum level being indexed (max_level()) have the special
//   property that they will never be an ancestor of a cell in the query
//   region.  Therefore we can safely skip generating "covering terms" for
//   these cells (see query step 2 above).
//
// + If the index will contain only points (rather than general regions), then
//   we can skip all the covering terms mentioned above because there will
//   never be any document regions larger than the query region.  This can
//   significantly reduce the size of queries.
//
// + If it is more important to optimize index size rather than query speed,
//   the number of index terms can be reduced by creating ancestor terms only
//   for the *proper* ancestors of the cells in a document region, and
//   compensating for this by including covering terms for all cells in the
//   query region (in addition to their ancestors).
//
//   Effectively, when the query region and a document region contain exactly
//   the same cell, we have a choice about whether to treat this match as a
//   "covering term" or an "ancestor term".  One choice minimizes query size
//   while the other minimizes index size.

package s2

import (
	"strings"

	"github.com/blevesearch/geo/s1"
)

// TermType identifies which of the two kinds of term is being generated for
// a cell: an ancestor term or a covering term (see the indexing strategy
// notes at the top of this file).
type TermType int

const (
	// ANCESTOR selects an ancestor term, which is rendered without a marker.
	ANCESTOR TermType = 1 + iota
	// COVERING selects a covering term, which is rendered with the marker
	// placed in front of the cell token.
	COVERING
)

var (
	// marker is the string inserted before the cell token of a covering
	// term so that it can be told apart from an ancestor term.
	marker = "$"

	// defaultMaxCells is the default value for Options.maxCells.
	// NOTE(review): nothing in the visible part of this file reads it;
	// NewRegionTermIndexer spells out the same value literally.
	defaultMaxCells = 8
)

// Options holds the tuning parameters used by RegionTermIndexer when it
// generates index and query terms.  All fields are unexported and are
// configured through the Set* methods on *Options.
type Options struct {
	///////////////// Options Inherited From S2RegionCoverer ////////////////

	// maxCells controls the maximum number of cells when approximating
	// each region.  This parameter value may be changed as often as desired,
	// e.g. to approximate some regions more accurately than others.
	//
	// Increasing this value during indexing will make indexes more accurate
	// but larger.  Increasing this value for queries will make queries more
	// accurate but slower.  (See regioncoverer.go for details on how this
	// parameter affects accuracy.)  For example, if you don't mind large
	// indexes but want fast serving, it might be reasonable to set
	// max_cells() == 100 during indexing and max_cells() == 8 for queries.
	//
	// DEFAULT: 8  (coarse approximations)
	maxCells int

	// minLevel and maxLevel control the minimum and maximum size of the
	// S2Cells used to approximate regions.  Setting these parameters
	// appropriately can reduce the size of the index and speed up queries by
	// reducing the number of terms needed.  For example, if you know that
	// your query regions will rarely be less than 100 meters in width, then
	// you could set maxLevel to the cell level whose cells are roughly
	// 100 meters across.  (maxLevel is a cell level number, not a distance
	// in meters.)
	//
	// This restricts the index to S2Cells that are approximately 100 meters
	// across or larger.  Similarly, if you know that query regions will
	// rarely be larger than 1000km across, then you could set minLevel to
	// the level whose cells are roughly that size.
	//
	// If minLevel is set too high, then large regions may generate too
	// many query terms.  If maxLevel is set too low, then small query
	// regions will not be able to discriminate which regions they intersect
	// very precisely and may return many more candidates than necessary.
	//
	// If you have no idea about the scale of the regions being queried,
	// it is perfectly fine to set minLevel to 0 and maxLevel to 30.
	// The only drawback is that this may result in a larger index and
	// slower queries.
	//
	// The default parameter values are suitable for query regions ranging
	// from about 100 meters to 3000 km across.
	//
	// DEFAULT: 4  (average cell width == 600km)
	minLevel int

	// DEFAULT: 16 (average cell width == 150m)
	maxLevel int

	// Setting levelMod to a value greater than 1 increases the effective
	// branching factor of the S2Cell hierarchy by skipping some levels.  For
	// example, if levelMod is set to 2 then every second level is skipped
	// (which increases the effective branching factor to 16).  You might
	// want to consider doing this if your query regions are typically very
	// small (e.g., single points) and you don't mind increasing the index
	// size (since skipping levels will reduce the accuracy of cell coverings
	// for a given maxCells limit).
	//
	// levelMod is used both as a divisor and as the step of the level loops
	// in the term generators, so it should be at least 1.
	//
	// DEFAULT: 1  (don't skip any cell levels)
	levelMod int

	// If your index will only contain points (rather than regions), be sure
	// to set this flag.  This will generate smaller and faster queries that
	// are specialized for the points-only case.
	//
	// With the default quality settings, this flag reduces the number of
	// query terms by about a factor of two.  (The improvement gets smaller
	// as maxCells is increased, but there is really no reason not to use
	// this flag if your index consists entirely of points.)
	//
	// DEFAULT: false
	pointsOnly bool

	// If true, the index will be optimized for space rather than for query
	// time.  With the default quality settings, this flag reduces the number
	// of index terms and increases the number of query terms by the same
	// factor (approximately 1.3).  The factor increases up to a limiting
	// ratio of 2.0 as maxCells is increased.
	//
	// CAVEAT: This option has no effect if the index contains only points.
	//
	// DEFAULT: false
	optimizeSpace bool
}

// MaxCells returns the maximum number of cells used when approximating a
// region.
func (o *Options) MaxCells() int {
	return o.maxCells
}

// SetMaxCells sets the maximum number of cells used when approximating a
// region.  The value is stored as given, without validation.
func (o *Options) SetMaxCells(mc int) {
	o.maxCells = mc
}

// MinLevel returns the minimum cell level used when generating terms.
func (o *Options) MinLevel() int {
	return o.minLevel
}

// SetMinLevel sets the minimum cell level used when generating terms.  The
// value is stored as given, without validation.
func (o *Options) SetMinLevel(ml int) {
	o.minLevel = ml
}

// MaxLevel returns the configured maximum cell level.  The deepest level
// actually used for covering terms may be lower when levelMod is not 1
// (see trueMaxLevel).
func (o *Options) MaxLevel() int {
	return o.maxLevel
}

// SetMaxLevel sets the maximum cell level used when generating terms.  The
// value is stored as given, without validation.
func (o *Options) SetMaxLevel(ml int) {
	o.maxLevel = ml
}

// LevelMod returns the step between consecutive cell levels used when
// generating terms.
func (o *Options) LevelMod() int {
	return o.levelMod
}

// SetLevelMod sets the step between consecutive cell levels.  The value is
// stored as given, without validation; it is later used as a divisor and as
// a loop step, so it should be at least 1.
func (o *Options) SetLevelMod(lm int) {
	o.levelMod = lm
}

// SetPointsOnly declares whether the index contains only points.  When set,
// the query-term generators return ancestor terms only and skip all
// covering terms.
func (o *Options) SetPointsOnly(v bool) {
	o.pointsOnly = v
}

// SetOptimizeSpace selects whether terms are generated to favor a smaller
// index (true) or smaller queries (false).
func (o *Options) SetOptimizeSpace(v bool) {
	o.optimizeSpace = v
}

// trueMaxLevel returns maxLevel reduced by the remainder of
// (maxLevel - minLevel) divided by levelMod, i.e. the deepest level not
// above maxLevel that can be reached from minLevel in steps of levelMod.
// With levelMod == 1 this is simply maxLevel.
//
// levelMod must not be zero: the remainder operation below panics with an
// integer divide-by-zero in that case.
func (o *Options) trueMaxLevel() int {
	if o.levelMod == 1 {
		return o.maxLevel
	}
	span := o.maxLevel - o.minLevel
	return o.maxLevel - span%o.levelMod
}

// RegionTermIndexer is a helper struct for adding spatial data to an
// information retrieval system.  Such systems work by converting documents
// into a collection of "index terms" (e.g., representing words or phrases),
// and then building an "inverted index" that maps each term to a list of
// documents (and document positions) where that term occurs.
//
// This class deals with the problem of converting spatial data into index
// terms, which can then be indexed along with the other document information.
//
// Spatial data is represented using the S2Region type.  Useful S2Region
// subtypes include:
//
//   S2Cap
//    - a disc-shaped region
//
//   S2LatLngRect
//    - a rectangle in latitude-longitude coordinates
//
//   S2Polyline
//    - a polyline
//
//   S2Polygon
//    - a polygon, possibly with multiple holes and/or shells
//
//   S2CellUnion
//    - a region approximated as a collection of S2CellIds
//
//   S2ShapeIndexRegion
//    - an arbitrary collection of points, polylines, and polygons
//
//   S2ShapeIndexBufferedRegion
//    - like the above, but expanded by a given radius
//
//   S2RegionUnion, S2RegionIntersection
//    - the union or intersection of arbitrary other regions
//
// So for example, if you want to query documents that are within 500 meters
// of a polyline, you could use an S2ShapeIndexBufferedRegion containing the
// polyline with a radius of 500 meters.
//
// For example usage, refer to:
// https://github.com/google/s2geometry/blob/ad1489e898f369ca09e2099353ccd55bd0fd7a26/src/s2/s2region_term_indexer.h#L58

// RegionTermIndexer turns points, regions and cell coverings into index and
// query terms according to its options.  The embedded coverer's MinLevel,
// MaxLevel, LevelMod and MaxCells are overwritten from options each time a
// region is covered by GetIndexTermsForRegion or GetQueryTermsForRegion.
type RegionTermIndexer struct {
	options       Options       // term-generation settings
	regionCoverer RegionCoverer // coverer used by the *ForRegion methods
}

// NewRegionTermIndexer returns a RegionTermIndexer that uses the default
// options: maxCells 8, minLevel 4, maxLevel 16 and levelMod 1, with
// pointsOnly and optimizeSpace both false.
func NewRegionTermIndexer() *RegionTermIndexer {
	defaults := Options{
		maxCells: 8,
		minLevel: 4,
		maxLevel: 16,
		levelMod: 1,
	}
	return &RegionTermIndexer{options: defaults}
}

// NewRegionTermIndexerWithOptions returns a RegionTermIndexer that uses a
// copy of the given options; the caller's value is not modified.
//
// A levelMod below 1 is replaced by 1, the documented default.  This covers
// the common case of an Options value on which SetLevelMod was never called
// (levelMod == 0): left as is, trueMaxLevel would divide by zero and the
// level loops in the term generators, which step by levelMod, would never
// terminate.
func NewRegionTermIndexerWithOptions(option Options) *RegionTermIndexer {
	if option.levelMod < 1 {
		option.levelMod = 1
	}
	return &RegionTermIndexer{options: option}
}

// GetTerm renders the term for the given cell.  An ANCESTOR term is the
// prefix followed by the cell's token; every other term type is rendered as
// a covering term, i.e. the prefix, then the marker, then the token.
func (rti *RegionTermIndexer) GetTerm(termTyp TermType, id CellID,
	prefix string) string {
	token := id.ToToken()
	switch termTyp {
	case ANCESTOR:
		return prefix + token
	default:
		return prefix + marker + token
	}
}

// GetIndexTermsForPoint returns the terms to index for a single point: one
// ANCESTOR term for the point's containing cell at every level from
// minLevel up to and including maxLevel, advancing by levelMod.  The result
// is nil when minLevel is greater than maxLevel.
func (rti *RegionTermIndexer) GetIndexTermsForPoint(p Point, prefix string) []string {
	// See the top of this file for an overview of the indexing strategy.
	//
	// The deepest cell produced here acts as the covering of the point.
	// It is nevertheless indexed as an ancestor term only, never as a
	// covering term: a query region will never contain a descendant of such
	// a cell.  This holds even when max_level() != true_max_level() (see
	// S2RegionCoverer::Options), which is why the walk is bounded by
	// maxLevel rather than by trueMaxLevel.
	leaf := cellIDFromPoint(p)
	opts := &rti.options

	var terms []string
	level := opts.minLevel
	for level <= opts.maxLevel {
		terms = append(terms, rti.GetTerm(ANCESTOR, leaf.Parent(level), prefix))
		level += opts.levelMod
	}
	return terms
}

// GetIndexTermsForRegion returns the terms to index for the given region.
// It copies the level and cell limits from the indexer's options into
// rti.regionCoverer (so the call mutates the receiver), computes a covering
// of the region with it, and converts that covering with
// GetIndexTermsForCanonicalCovering.
func (rti *RegionTermIndexer) GetIndexTermsForRegion(region Region,
	prefix string) []string {
	coverer, opts := &rti.regionCoverer, &rti.options
	coverer.MinLevel = opts.minLevel
	coverer.MaxLevel = opts.maxLevel
	coverer.LevelMod = opts.levelMod
	coverer.MaxCells = opts.maxCells

	return rti.GetIndexTermsForCanonicalCovering(coverer.Covering(region), prefix)
}

// GetIndexTermsForCanonicalCovering returns the terms to index for a
// document whose region is described by the given covering.
//
// For every cell of the covering it emits, in this order:
//   - a COVERING term for the cell, unless the cell is at or below
//     trueMaxLevel;
//   - an ANCESTOR term for the cell itself, when the cell is exactly at
//     trueMaxLevel or when optimizeSpace is off;
//   - an ANCESTOR term for each coarser ancestor, stepping up by levelMod
//     while the level stays at or above minLevel, stopping early once an
//     ancestor shared with the previously processed cell is reached.
func (rti *RegionTermIndexer) GetIndexTermsForCanonicalCovering(
	covering CellUnion, prefix string) []string {
	// See the top of this file for an overview of the indexing strategy.
	//
	// Covering cells normally become covering terms.  When optimizing for
	// query time rather than index space they additionally become ancestor
	// terms, which keeps queries smaller.  Cells at true_max_level() are
	// always indexed as ancestor terms only, because a query region never
	// contains a descendant of such a cell.
	opts := &rti.options
	deepest := opts.trueMaxLevel()

	var terms []string
	last := CellID(0)
	for _, id := range covering {
		idLevel := id.Level()

		if idLevel < deepest {
			terms = append(terms, rti.GetTerm(COVERING, id, prefix))
		}
		if idLevel == deepest || !opts.optimizeSpace {
			terms = append(terms, rti.GetTerm(ANCESTOR, id.Parent(idLevel), prefix))
		}

		for lvl := idLevel - opts.levelMod; lvl >= opts.minLevel; lvl -= opts.levelMod {
			parent := id.Parent(lvl)
			// The previous cell is deeper than lvl and has the same ancestor
			// here, so this term (and, presumably relying on the covering
			// being canonical, every coarser one) was already emitted while
			// handling earlier cells.
			if last != CellID(0) && last.Level() > lvl && last.Parent(lvl) == parent {
				break
			}
			terms = append(terms, rti.GetTerm(ANCESTOR, parent, prefix))
		}
		last = id
	}

	return terms
}

// GetQueryTermsForPoint returns the terms to query for a single point.
//
// The first term is always the ANCESTOR term of the point's cell at
// trueMaxLevel.  Unless the index is points-only, it is followed by a
// COVERING term for the point's cell at every level from trueMaxLevel down
// to minLevel, stepping by levelMod.
func (rti *RegionTermIndexer) GetQueryTermsForPoint(p Point, prefix string) []string {
	leaf := cellIDFromPoint(p)
	opts := &rti.options
	top := opts.trueMaxLevel()

	terms := []string{rti.GetTerm(ANCESTOR, leaf.Parent(top), prefix)}
	if opts.pointsOnly {
		// A points-only index holds no covering terms to match against.
		return terms
	}

	for lvl := top; lvl >= opts.minLevel; lvl -= opts.levelMod {
		terms = append(terms, rti.GetTerm(COVERING, leaf.Parent(lvl), prefix))
	}
	return terms
}

// GetQueryTermsForRegion returns the terms to query for the given region.
// It copies the level and cell limits from the indexer's options into
// rti.regionCoverer (so the call mutates the receiver), computes a covering
// of the region with it, and converts that covering with
// GetQueryTermsForCanonicalCovering.
func (rti *RegionTermIndexer) GetQueryTermsForRegion(region Region,
	prefix string) []string {
	coverer, opts := &rti.regionCoverer, &rti.options
	coverer.MinLevel = opts.minLevel
	coverer.MaxLevel = opts.maxLevel
	coverer.LevelMod = opts.levelMod
	coverer.MaxCells = opts.maxCells

	return rti.GetQueryTermsForCanonicalCovering(coverer.Covering(region), prefix)
}

// GetQueryTermsForCanonicalCovering returns the terms to query for a region
// described by the given covering.
//
// For every cell of the covering it emits an ANCESTOR term for the cell.
// Unless the index is points-only it then also emits:
//   - a COVERING term for the cell itself, when optimizeSpace is on and the
//     cell is above trueMaxLevel;
//   - a COVERING term for each coarser ancestor, stepping up by levelMod
//     while the level stays at or above minLevel, stopping early once an
//     ancestor shared with the previously processed cell is reached.
func (rti *RegionTermIndexer) GetQueryTermsForCanonicalCovering(
	covering CellUnion, prefix string) []string {
	opts := &rti.options
	deepest := opts.trueMaxLevel()

	var terms []string
	last := CellID(0)
	for _, id := range covering {
		terms = append(terms, rti.GetTerm(ANCESTOR, id, prefix))

		// A points-only index holds no covering terms to match against.
		if opts.pointsOnly {
			continue
		}

		idLevel := id.Level()
		if opts.optimizeSpace && idLevel < deepest {
			terms = append(terms, rti.GetTerm(COVERING, id, prefix))
		}

		for lvl := idLevel - opts.levelMod; lvl >= opts.minLevel; lvl -= opts.levelMod {
			parent := id.Parent(lvl)
			// The previous cell is deeper than lvl and has the same ancestor
			// here, so this term (and, presumably relying on the covering
			// being canonical, every coarser one) was already emitted while
			// handling earlier cells.
			if last != CellID(0) && last.Level() > lvl && last.Parent(lvl) == parent {
				break
			}
			terms = append(terms, rti.GetTerm(COVERING, parent, prefix))
		}

		last = id
	}

	return terms
}

// CapFromCenterAndRadius builds a Cap centered on the given latitude and
// longitude (in degrees) whose angular radius is (dist/1000)/6378.
//
// NOTE(review): the arithmetic suggests dist is a distance in meters that is
// converted to kilometers and then divided by an Earth radius of 6378 km to
// obtain radians — confirm against callers.
func CapFromCenterAndRadius(centerLat, centerLon, dist float64) Cap {
	center := PointFromLatLng(LatLngFromDegrees(centerLat, centerLon))
	radius := s1.Angle((dist / 1000) / 6378)
	return CapFromCenterAngle(center, radius)
}

// FilterOutCoveringTerms returns a new slice holding, in their original
// order, the terms that do not start with the covering-term marker.  It is
// meant to shrink the set of search terms when searching in a
// one-dimensional space (the points-only indexing use case).
//
// Only a leading marker is recognized.  GetTerm places the prefix before
// the marker, so covering terms generated with a non-empty prefix are not
// removed by this function.
//
// The input slice is not modified; the result is never nil.
func FilterOutCoveringTerms(terms []string) []string {
	kept := make([]string, 0, len(terms))
	for i := range terms {
		if !strings.HasPrefix(terms[i], marker) {
			kept = append(kept, terms[i])
		}
	}
	return kept
}