+ * -foo +bar baz
+ */
+
+ /**
+ * Performs a search against the index using lunr query syntax.
+ *
+ * Results will be returned sorted by their score, the most relevant results
+ * will be returned first. For details on how the score is calculated, please see
+ * the {@link https://lunrjs.com/guides/searching.html#scoring|guide}.
+ *
+ * For more programmatic querying use lunr.Index#query.
+ *
+ * @param {lunr.Index~QueryString} queryString - A string containing a lunr query.
+ * @throws {lunr.QueryParseError} If the passed query string cannot be parsed.
+ * @returns {lunr.Index~Result[]}
+ */
+ lunr.Index.prototype.search = function (queryString) {
+ return this.query(function (query) {
+ var parser = new lunr.QueryParser(queryString, query);
+ parser.parse();
+ });
+ };
+
+ /**
+ * A query builder callback provides a query object to be used to express
+ * the query to perform on the index.
+ *
+ * @callback lunr.Index~queryBuilder
+ * @param {lunr.Query} query - The query object to build up.
+ * @this lunr.Query
+ */
+
+ /**
+ * Performs a query against the index using the yielded lunr.Query object.
+ *
+ * If performing programmatic queries against the index, this method is preferred
+ * over lunr.Index#search so as to avoid the additional query parsing overhead.
+ *
+ * A query object is yielded to the supplied function which should be used to
+ * express the query to be run against the index.
+ *
+ * Note that although this function takes a callback parameter it is _not_ an
+ * asynchronous operation, the callback is just yielded a query object to be
+ * customized.
+ *
+ * How clauses are combined: a document is dropped when it is missing from the
+ * required matches of any field, or when it appears in the prohibited matches
+ * of any field. Every remaining matching document field is scored against the
+ * query vector of its field, and the field scores of one document are added
+ * together. Each result has the shape { ref, score, matchData } and results
+ * are sorted by descending score.
+ *
+ * @param {lunr.Index~queryBuilder} fn - A function that is used to build the query.
+ * It is called with the new lunr.Query as both `this` and its only argument.
+ * @returns {lunr.Index~Result[]}
+ */
+ lunr.Index.prototype.query = function (fn) {
+ // for each query clause
+ // * process terms
+ // * expand terms from token set
+ // * find matching documents and metadata
+ // * get document vectors
+ // * score documents
+
+ var query = new lunr.Query(this.fields),
+ matchingFields = Object.create(null),
+ queryVectors = Object.create(null),
+ termFieldCache = Object.create(null),
+ requiredMatches = Object.create(null),
+ prohibitedMatches = Object.create(null);
+
+ /*
+ * To support field level boosts a query vector is created per
+ * field. An empty vector is eagerly created to support negated
+ * queries.
+ */
+ for (var i = 0; i < this.fields.length; i++) {
+ queryVectors[this.fields[i]] = new lunr.Vector();
+ }
+
+ fn.call(query, query);
+
+ for (var i = 0; i < query.clauses.length; i++) {
+ /*
+ * Unless the pipeline has been disabled for this term, which is
+ * the case for terms with wildcards, we need to pass the clause
+ * term through the search pipeline. A pipeline returns an array
+ * of processed terms. Pipeline functions may expand the passed
+ * term, which means we may end up performing multiple index lookups
+ * for a single query term.
+ *
+ * clauseMatches starts as the complete set and is only replaced when
+ * a required clause finds matching documents (see below).
+ */
+ var clause = query.clauses[i],
+ terms = null,
+ clauseMatches = lunr.Set.complete;
+
+ if (clause.usePipeline) {
+ terms = this.pipeline.runString(clause.term, {
+ fields: clause.fields,
+ });
+ } else {
+ terms = [clause.term];
+ }
+
+ for (var m = 0; m < terms.length; m++) {
+ var term = terms[m];
+
+ /*
+ * Each term returned from the pipeline needs to use the same query
+ * clause object, e.g. the same boost and or edit distance. The
+ * simplest way to do this is to re-use the clause object but mutate
+ * its term property.
+ */
+ clause.term = term;
+
+ /*
+ * From the term in the clause we create a token set which will then
+ * be used to intersect the indexes token set to get a list of terms
+ * to lookup in the inverted index
+ */
+ var termTokenSet = lunr.TokenSet.fromClause(clause),
+ expandedTerms = this.tokenSet.intersect(termTokenSet).toArray();
+
+ /*
+ * If a term marked as required does not exist in the tokenSet it is
+ * impossible for the search to return any matches. We set all the field
+ * scoped required matches set to empty and stop examining the remaining
+ * terms of this clause. Note that the break only leaves the term loop:
+ * later clauses are still processed, but intersecting with the empty
+ * set keeps the required matches for these fields empty.
+ */
+ if (
+ expandedTerms.length === 0 &&
+ clause.presence === lunr.Query.presence.REQUIRED
+ ) {
+ for (var k = 0; k < clause.fields.length; k++) {
+ var field = clause.fields[k];
+ requiredMatches[field] = lunr.Set.empty;
+ }
+
+ break;
+ }
+
+ for (var j = 0; j < expandedTerms.length; j++) {
+ /*
+ * For each term get the posting and termIndex, this is required for
+ * building the query vector.
+ */
+ var expandedTerm = expandedTerms[j],
+ posting = this.invertedIndex[expandedTerm],
+ termIndex = posting._index;
+
+ for (var k = 0; k < clause.fields.length; k++) {
+ /*
+ * For each field that this query term is scoped by (by default
+ * all fields are in scope) we need to get all the document refs
+ * that have this term in that field.
+ *
+ * The posting is the entry in the invertedIndex for the matching
+ * term from above.
+ *
+ * termField ("term/field") is the key used in termFieldCache to
+ * avoid collecting match data for the same combination twice.
+ */
+ var field = clause.fields[k],
+ fieldPosting = posting[field],
+ matchingDocumentRefs = Object.keys(fieldPosting),
+ termField = expandedTerm + "/" + field,
+ matchingDocumentsSet = new lunr.Set(matchingDocumentRefs);
+
+ /*
+ * if the presence of this term is required ensure that the matching
+ * documents are added to the set of required matches for this clause.
+ * The field scoped required set is seeded with the complete set here
+ * and narrowed once all terms of the clause have been processed.
+ */
+ if (clause.presence == lunr.Query.presence.REQUIRED) {
+ clauseMatches = clauseMatches.union(matchingDocumentsSet);
+
+ if (requiredMatches[field] === undefined) {
+ requiredMatches[field] = lunr.Set.complete;
+ }
+ }
+
+ /*
+ * if the presence of this term is prohibited ensure that the matching
+ * documents are added to the set of prohibited matches for this field,
+ * creating that set if it does not yet exist.
+ */
+ if (clause.presence == lunr.Query.presence.PROHIBITED) {
+ if (prohibitedMatches[field] === undefined) {
+ prohibitedMatches[field] = lunr.Set.empty;
+ }
+
+ prohibitedMatches[field] = prohibitedMatches[field].union(
+ matchingDocumentsSet
+ );
+
+ /*
+ * Prohibited matches should not be part of the query vector used for
+ * similarity scoring and no metadata should be extracted so we continue
+ * to the next field
+ */
+ continue;
+ }
+
+ /*
+ * The query field vector is populated using the termIndex found for
+ * the term and a unit value with the appropriate boost applied.
+ * Using upsert because there could already be an entry in the vector
+ * for the term we are working with. In that case we just add the scores
+ * together.
+ */
+ queryVectors[field].upsert(
+ termIndex,
+ clause.boost,
+ function (a, b) {
+ return a + b;
+ }
+ );
+
+ /**
+ * If we've already seen this term, field combo then we've already collected
+ * the matching documents and metadata, no need to go through all that again
+ */
+ if (termFieldCache[termField]) {
+ continue;
+ }
+
+ for (var l = 0; l < matchingDocumentRefs.length; l++) {
+ /*
+ * All metadata for this term/field/document triple
+ * are then extracted and collected into an instance
+ * of lunr.MatchData ready to be returned in the query
+ * results. The lunr.FieldRef is used as an object key, so
+ * matchingFields is keyed by its string form.
+ */
+ var matchingDocumentRef = matchingDocumentRefs[l],
+ matchingFieldRef = new lunr.FieldRef(
+ matchingDocumentRef,
+ field
+ ),
+ metadata = fieldPosting[matchingDocumentRef],
+ fieldMatch;
+
+ if (
+ (fieldMatch = matchingFields[matchingFieldRef]) === undefined
+ ) {
+ matchingFields[matchingFieldRef] = new lunr.MatchData(
+ expandedTerm,
+ field,
+ metadata
+ );
+ } else {
+ fieldMatch.add(expandedTerm, field, metadata);
+ }
+ }
+
+ termFieldCache[termField] = true;
+ }
+ }
+ }
+
+ /**
+ * If the presence was required we need to update the requiredMatches field sets.
+ * We do this after all fields for the term have collected their matches because
+ * the clause terms presence is required in _any_ of the fields not _all_ of the
+ * fields.
+ */
+ if (clause.presence === lunr.Query.presence.REQUIRED) {
+ for (var k = 0; k < clause.fields.length; k++) {
+ var field = clause.fields[k];
+ requiredMatches[field] = requiredMatches[field].intersect(
+ clauseMatches
+ );
+ }
+ }
+ }
+
+ /**
+ * Need to combine the field scoped required and prohibited
+ * matching documents into a global set of required and prohibited
+ * matches
+ */
+ var allRequiredMatches = lunr.Set.complete,
+ allProhibitedMatches = lunr.Set.empty;
+
+ for (var i = 0; i < this.fields.length; i++) {
+ var field = this.fields[i];
+
+ if (requiredMatches[field]) {
+ allRequiredMatches = allRequiredMatches.intersect(
+ requiredMatches[field]
+ );
+ }
+
+ if (prohibitedMatches[field]) {
+ allProhibitedMatches = allProhibitedMatches.union(
+ prohibitedMatches[field]
+ );
+ }
+ }
+
+ var matchingFieldRefs = Object.keys(matchingFields),
+ results = [],
+ matches = Object.create(null);
+
+ /*
+ * If the query is negated (contains only prohibited terms)
+ * we need to get _all_ fieldRefs currently existing in the
+ * index. This is only done when we know that the query is
+ * entirely prohibited terms to avoid any cost of getting all
+ * fieldRefs unnecessarily.
+ *
+ * Additionally, blank MatchData must be created to correctly
+ * populate the results.
+ *
+ * NOTE(review): the fieldRef parsed inside this loop is never read
+ * before it is reassigned in the scoring loop below; it looks
+ * removable, but confirm nothing relies on fromString rejecting
+ * malformed keys here.
+ */
+ if (query.isNegated()) {
+ matchingFieldRefs = Object.keys(this.fieldVectors);
+
+ for (var i = 0; i < matchingFieldRefs.length; i++) {
+ var matchingFieldRef = matchingFieldRefs[i];
+ var fieldRef = lunr.FieldRef.fromString(matchingFieldRef);
+ matchingFields[matchingFieldRef] = new lunr.MatchData();
+ }
+ }
+
+ for (var i = 0; i < matchingFieldRefs.length; i++) {
+ /*
+ * Currently we have document fields that match the query, but we
+ * need to return documents. The matchData and scores are combined
+ * from multiple fields belonging to the same document.
+ *
+ * Scores are calculated by field, using the query vectors created
+ * above, and combined into a final document score using addition.
+ *
+ * Documents outside the required set, or inside the prohibited
+ * set, are skipped before any scoring is done.
+ */
+ var fieldRef = lunr.FieldRef.fromString(matchingFieldRefs[i]),
+ docRef = fieldRef.docRef;
+
+ if (!allRequiredMatches.contains(docRef)) {
+ continue;
+ }
+
+ if (allProhibitedMatches.contains(docRef)) {
+ continue;
+ }
+
+ var fieldVector = this.fieldVectors[fieldRef],
+ score = queryVectors[fieldRef.fieldName].similarity(fieldVector),
+ docMatch;
+
+ if ((docMatch = matches[docRef]) !== undefined) {
+ docMatch.score += score;
+ docMatch.matchData.combine(matchingFields[fieldRef]);
+ } else {
+ var match = {
+ ref: docRef,
+ score: score,
+ matchData: matchingFields[fieldRef],
+ };
+ matches[docRef] = match;
+ results.push(match);
+ }
+ }
+
+ /*
+ * Sort the results objects by score, highest first.
+ */
+ return results.sort(function (a, b) {
+ return b.score - a.score;
+ });
+ };
+
+ /**
+ * Prepares the index for JSON serialization.
+ *
+ * The schema for this JSON blob will be described in a
+ * separate JSON schema file.
+ *
+ * @returns {Object}
+ */
+ lunr.Index.prototype.toJSON = function () {
+ var invertedIndex = Object.keys(this.invertedIndex)
+ .sort()
+ .map(function (term) {
+ return [term, this.invertedIndex[term]];
+ }, this);
+
+ var fieldVectors = Object.keys(this.fieldVectors).map(function (ref) {
+ return [ref, this.fieldVectors[ref].toJSON()];
+ }, this);
+
+ return {
+ version: lunr.version,
+ fields: this.fields,
+ fieldVectors: fieldVectors,
+ invertedIndex: invertedIndex,
+ pipeline: this.pipeline.toJSON(),
+ };
+ };
+
+ /**
+ * Loads a previously serialized lunr.Index
+ *
+ * @param {Object} serializedIndex - A previously serialized lunr.Index
+ * @returns {lunr.Index}
+ */
+ lunr.Index.load = function (serializedIndex) {
+ var attrs = {},
+ fieldVectors = {},
+ serializedVectors = serializedIndex.fieldVectors,
+ invertedIndex = {},
+ serializedInvertedIndex = serializedIndex.invertedIndex,
+ tokenSetBuilder = new lunr.TokenSet.Builder(),
+ pipeline = lunr.Pipeline.load(serializedIndex.pipeline);
+
+ if (serializedIndex.version != lunr.version) {
+ lunr.utils.warn(
+ "Version mismatch when loading serialised index. Current version of lunr '" +
+ lunr.version +
+ "' does not match serialized index '" +
+ serializedIndex.version +
+ "'"
+ );
+ }
+
+ for (var i = 0; i < serializedVectors.length; i++) {
+ var tuple = serializedVectors[i],
+ ref = tuple[0],
+ elements = tuple[1];
+
+ fieldVectors[ref] = new lunr.Vector(elements);
+ }
+
+ for (var i = 0; i < serializedInvertedIndex.length; i++) {
+ var tuple = serializedInvertedIndex[i],
+ term = tuple[0],
+ posting = tuple[1];
+
+ tokenSetBuilder.insert(term);
+ invertedIndex[term] = posting;
+ }
+
+ tokenSetBuilder.finish();
+
+ attrs.fields = serializedIndex.fields;
+
+ attrs.fieldVectors = fieldVectors;
+ attrs.invertedIndex = invertedIndex;
+ attrs.tokenSet = tokenSetBuilder.root;
+ attrs.pipeline = pipeline;
+
+ return new lunr.Index(attrs);
+ };
+ /*!
+ * lunr.Builder
+ * Copyright (C) 2018 Oliver Nightingale
+ */
+
+ /**
+ * lunr.Builder performs indexing on a set of documents and
+ * returns instances of lunr.Index ready for querying.
+ *
+ * All configuration of the index is done via the builder, the
+ * fields to index, the document reference, the text processing
+ * pipeline and document scoring parameters are all set on the
+ * builder before indexing.
+ *
+ * @constructor
+ * @property {string} _ref - Internal reference to the document reference field, defaults to "id".
+ * @property {object} _fields - Internal map from each field name to index to the attributes registered for that field.
+ * @property {object} _documents - Internal map from each added document ref to the attributes supplied for that document.
+ * @property {object} invertedIndex - The inverted index maps terms to document fields.
+ * @property {object} fieldTermFrequencies - Keeps track of term frequencies, keyed by field ref.
+ * @property {object} fieldLengths - Keeps track of the number of terms in each field added to the index, keyed by field ref.
+ * @property {lunr.tokenizer} tokenizer - Function for splitting strings into tokens for indexing.
+ * @property {lunr.Pipeline} pipeline - The pipeline performs text processing on tokens before indexing.
+ * @property {lunr.Pipeline} searchPipeline - A pipeline for processing search terms before querying the index.
+ * @property {number} documentCount - Keeps track of the total number of documents indexed.
+ * @property {number} _b - A parameter to control field length normalization, setting this to 0 disables normalization, 1 fully normalizes field lengths, the default value is 0.75.
+ * @property {number} _k1 - A parameter to control how quickly an increase in term frequency results in term frequency saturation, the default value is 1.2.
+ * @property {number} termIndex - A counter incremented for each unique term, used to identify a term's position in the vector space.
+ * @property {array} metadataWhitelist - A list of metadata keys that have been whitelisted for entry in the index.
+ */
+ lunr.Builder = function () {
+ this._ref = "id";
+ this._fields = Object.create(null);
+ this._documents = Object.create(null);
+ this.invertedIndex = Object.create(null);
+ this.fieldTermFrequencies = {};
+ this.fieldLengths = {};
+ this.tokenizer = lunr.tokenizer;
+ this.pipeline = new lunr.Pipeline();
+ this.searchPipeline = new lunr.Pipeline();
+ this.documentCount = 0;
+ this._b = 0.75;
+ this._k1 = 1.2;
+ this.termIndex = 0;
+ this.metadataWhitelist = [];
+ };
+
+ /**
+ * Sets the document field used as the document reference. Every document must have this field.
+ * The type of this field in the document should be a string, if it is not a string it will be
+ * coerced into a string by calling toString.
+ *
+ * The default ref is 'id'.
+ *
+ * The ref should _not_ be changed during indexing, it should be set before any documents are
+ * added to the index. Changing it during indexing can lead to inconsistent results.
+ *
+ * The name is stored on the builder and read by lunr.Builder#add to look up each
+ * document's reference. Nothing is returned.
+ *
+ * @param {string} ref - The name of the reference field in the document.
+ */
+ lunr.Builder.prototype.ref = function (ref) {
+ this._ref = ref;
+ };
+
+ /**
+ * A function that is used to extract a field from a document.
+ *
+ * Lunr expects a field to be at the top level of a document, if however the field
+ * is deeply nested within a document an extractor function can be used to extract
+ * the right field for indexing.
+ *
+ * @callback fieldExtractor
+ * @param {object} doc - The document being added to the index.
+ * @returns {?(string|object|object[])} obj - The object that will be indexed for this field.
+ * @example <caption>Extracting a nested field</caption>
+ * function (doc) { return doc.nested.field }
+ */
+
+ /**
+ * Adds a field to the list of document fields that will be indexed. Every document being
+ * indexed should have this field. Null values for this field in indexed documents will
+ * not cause errors but will limit the chance of that document being retrieved by searches.
+ *
+ * All fields should be added before adding documents to the index. Adding fields after
+ * a document has been indexed will have no effect on already indexed documents.
+ *
+ * Fields can be boosted at build time. This allows terms within that field to have more
+ * importance when ranking search results. Use a field boost to specify that matches within
+ * one field are more important than other fields.
+ *
+ * @param {string} fieldName - The name of a field to index in all documents.
+ * @param {object} attributes - Optional attributes associated with this field.
+ * @param {number} [attributes.boost=1] - Boost applied to all terms within this field.
+ * @param {fieldExtractor} [attributes.extractor] - Function to extract a field from a document.
+ * @throws {RangeError} fieldName cannot contain unsupported characters '/'
+ */
+ lunr.Builder.prototype.field = function (fieldName, attributes) {
+ if (/\//.test(fieldName)) {
+ throw new RangeError(
+ "Field '" + fieldName + "' contains illegal character '/'"
+ );
+ }
+
+ this._fields[fieldName] = attributes || {};
+ };
+
+ /**
+ * A parameter to tune the amount of field length normalisation that is applied when
+ * calculating relevance scores. A value of 0 will completely disable any normalisation
+ * and a value of 1 will fully normalise field lengths. The default is 0.75. Values of b
+ * will be clamped to the range 0 - 1.
+ *
+ * @param {number} number - The value to set for this tuning parameter.
+ */
+ lunr.Builder.prototype.b = function (number) {
+ if (number < 0) {
+ this._b = 0;
+ } else if (number > 1) {
+ this._b = 1;
+ } else {
+ this._b = number;
+ }
+ };
+
+ /**
+ * A parameter that controls the speed at which a rise in term frequency results in term
+ * frequency saturation. The default value is 1.2. Setting this to a higher value will give
+ * slower saturation levels, a lower value will result in quicker saturation.
+ *
+ * The value is stored as given; no validation or clamping is applied. Nothing is returned.
+ *
+ * @param {number} number - The value to set for this tuning parameter.
+ */
+ lunr.Builder.prototype.k1 = function (number) {
+ this._k1 = number;
+ };
+
+ /**
+ * Adds a document to the index.
+ *
+ * Before adding fields to the index the index should have been fully setup, with the document
+ * ref and all fields to index already having been specified.
+ *
+ * The document must have a field name as specified by the ref (by default this is 'id') and
+ * it should have all fields defined for indexing, though null or undefined values will not
+ * cause errors.
+ *
+ * Entire documents can be boosted at build time. Applying a boost to a document indicates that
+ * this document should rank higher in search results than other documents.
+ *
+ * For every registered field the value is taken from the field's extractor when one was
+ * configured, otherwise from doc[fieldName]. It is then tokenized and run through the indexing
+ * pipeline. The resulting terms update the term frequencies and length recorded for the field
+ * ref, and the inverted index together with any whitelisted token metadata.
+ *
+ * Note that documentCount is incremented on every call, including when a document with an
+ * already used ref is added again.
+ *
+ * @param {object} doc - The document to add to the index.
+ * @param {object} attributes - Optional attributes associated with this document.
+ * @param {number} [attributes.boost=1] - Boost applied to all terms within this document.
+ */
+ lunr.Builder.prototype.add = function (doc, attributes) {
+ var docRef = doc[this._ref],
+ fields = Object.keys(this._fields);
+
+ this._documents[docRef] = attributes || {};
+ this.documentCount += 1;
+
+ for (var i = 0; i < fields.length; i++) {
+ var fieldName = fields[i],
+ extractor = this._fields[fieldName].extractor,
+ field = extractor ? extractor(doc) : doc[fieldName],
+ tokens = this.tokenizer(field, {
+ fields: [fieldName],
+ }),
+ terms = this.pipeline.run(tokens),
+ fieldRef = new lunr.FieldRef(docRef, fieldName),
+ fieldTerms = Object.create(null);
+
+ this.fieldTermFrequencies[fieldRef] = fieldTerms;
+ this.fieldLengths[fieldRef] = 0;
+
+ // store the length of this field for this document (number of terms left after the pipeline)
+ this.fieldLengths[fieldRef] += terms.length;
+
+ // calculate term frequencies for this field; each term is used as an object key here
+ for (var j = 0; j < terms.length; j++) {
+ var term = terms[j];
+
+ if (fieldTerms[term] == undefined) {
+ fieldTerms[term] = 0;
+ }
+
+ fieldTerms[term] += 1;
+
+ // add to inverted index
+ // create an initial posting if one doesn't exist, with a fresh termIndex and an empty entry per registered field
+ if (this.invertedIndex[term] == undefined) {
+ var posting = Object.create(null);
+ posting["_index"] = this.termIndex;
+ this.termIndex += 1;
+
+ for (var k = 0; k < fields.length; k++) {
+ posting[fields[k]] = Object.create(null);
+ }
+
+ this.invertedIndex[term] = posting;
+ }
+
+ // add an entry for this term/fieldName/docRef to the invertedIndex
+ if (this.invertedIndex[term][fieldName][docRef] == undefined) {
+ this.invertedIndex[term][fieldName][docRef] = Object.create(null);
+ }
+
+ // store all whitelisted metadata about this token in the
+ // inverted index, one array entry per occurrence of the term
+ for (var l = 0; l < this.metadataWhitelist.length; l++) {
+ var metadataKey = this.metadataWhitelist[l],
+ metadata = term.metadata[metadataKey];
+
+ if (
+ this.invertedIndex[term][fieldName][docRef][metadataKey] ==
+ undefined
+ ) {
+ this.invertedIndex[term][fieldName][docRef][metadataKey] = [];
+ }
+
+ this.invertedIndex[term][fieldName][docRef][metadataKey].push(
+ metadata
+ );
+ }
+ }
+ }
+ };
+
+ /**
+ * Calculates the average document length for this index
+ *
+ * @private
+ */
+ lunr.Builder.prototype.calculateAverageFieldLengths = function () {
+ var fieldRefs = Object.keys(this.fieldLengths),
+ numberOfFields = fieldRefs.length,
+ accumulator = {},
+ documentsWithField = {};
+
+ for (var i = 0; i < numberOfFields; i++) {
+ var fieldRef = lunr.FieldRef.fromString(fieldRefs[i]),
+ field = fieldRef.fieldName;
+
+ documentsWithField[field] || (documentsWithField[field] = 0);
+ documentsWithField[field] += 1;
+
+ accumulator[field] || (accumulator[field] = 0);
+ accumulator[field] += this.fieldLengths[fieldRef];
+ }
+
+ var fields = Object.keys(this._fields);
+
+ for (var i = 0; i < fields.length; i++) {
+ var fieldName = fields[i];
+ accumulator[fieldName] =
+ accumulator[fieldName] / documentsWithField[fieldName];
+ }
+
+ this.averageFieldLength = accumulator;
+ };
+
+ /**
+ * Builds a vector space model of every document using lunr.Vector
+ *
+ * One vector is built per field ref recorded in fieldTermFrequencies. Each
+ * term of the field contributes one element, positioned at the term's _index
+ * from the inverted index. The element weight is a BM25 term score computed
+ * from the term's idf, its frequency in the field, the field length relative
+ * to averageFieldLength, and the _k1 and _b parameters, multiplied by the
+ * field boost and the document boost (each defaulting to 1).
+ *
+ * averageFieldLength is read here, so calculateAverageFieldLengths has to run
+ * first. The result is stored on fieldVectors.
+ *
+ * @private
+ */
+ lunr.Builder.prototype.createFieldVectors = function () {
+ var fieldVectors = {},
+ fieldRefs = Object.keys(this.fieldTermFrequencies),
+ fieldRefsLength = fieldRefs.length,
+ termIdfCache = Object.create(null);
+
+ for (var i = 0; i < fieldRefsLength; i++) {
+ var fieldRef = lunr.FieldRef.fromString(fieldRefs[i]),
+ fieldName = fieldRef.fieldName,
+ fieldLength = this.fieldLengths[fieldRef],
+ fieldVector = new lunr.Vector(),
+ termFrequencies = this.fieldTermFrequencies[fieldRef],
+ terms = Object.keys(termFrequencies),
+ termsLength = terms.length;
+
+ var fieldBoost = this._fields[fieldName].boost || 1,
+ docBoost = this._documents[fieldRef.docRef].boost || 1;
+
+ for (var j = 0; j < termsLength; j++) {
+ var term = terms[j],
+ tf = termFrequencies[term],
+ termIndex = this.invertedIndex[term]._index,
+ idf,
+ score,
+ scoreWithPrecision;
+
+ if (termIdfCache[term] === undefined) {
+ idf = lunr.idf(this.invertedIndex[term], this.documentCount);
+ termIdfCache[term] = idf;
+ } else {
+ idf = termIdfCache[term];
+ }
+
+ score =
+ (idf * ((this._k1 + 1) * tf)) /
+ (this._k1 *
+ (1 -
+ this._b +
+ this._b * (fieldLength / this.averageFieldLength[fieldName])) +
+ tf);
+ score *= fieldBoost;
+ score *= docBoost;
+ scoreWithPrecision = Math.round(score * 1000) / 1000;
+ // Rounds to three decimal places, e.g. 1.23456789 becomes 1.235.
+ // Reducing the precision so that the vectors take up less
+ // space when serialised. Doing it now so that they behave
+ // the same before and after serialisation. Also, this is
+ // the fastest approach to reducing a number's precision in
+ // JavaScript.
+
+ fieldVector.insert(termIndex, scoreWithPrecision);
+ }
+
+ fieldVectors[fieldRef] = fieldVector;
+ }
+
+ this.fieldVectors = fieldVectors;
+ };
+
+ /**
+ * Creates a token set of all tokens in the index using lunr.TokenSet
+ *
+ * @private
+ */
+ lunr.Builder.prototype.createTokenSet = function () {
+ this.tokenSet = lunr.TokenSet.fromArray(
+ Object.keys(this.invertedIndex).sort()
+ );
+ };
+
+ /**
+ * Builds the index, creating an instance of lunr.Index.
+ *
+ * This completes the indexing process and should only be called
+ * once all documents have been added to the index.
+ *
+ * @returns {lunr.Index}
+ */
+ lunr.Builder.prototype.build = function () {
+ this.calculateAverageFieldLengths();
+ this.createFieldVectors();
+ this.createTokenSet();
+
+ return new lunr.Index({
+ invertedIndex: this.invertedIndex,
+ fieldVectors: this.fieldVectors,
+ tokenSet: this.tokenSet,
+ fields: Object.keys(this._fields),
+ pipeline: this.searchPipeline,
+ });
+ };
+
+ /**
+ * Applies a plugin to the index builder.
+ *
+ * A plugin is a function that is called with the index builder as its context.
+ * Plugins can be used to customise or extend the behaviour of the index
+ * in some way. A plugin is just a function, that encapsulated the custom
+ * behaviour that should be applied when building the index.
+ *
+ * The plugin function will be called with the index builder as its argument, additional
+ * arguments can also be passed when calling use. The function will be called
+ * with the index builder as its context.
+ *
+ * @param {Function} plugin The plugin to apply.
+ */
+ lunr.Builder.prototype.use = function (fn) {
+ var args = Array.prototype.slice.call(arguments, 1);
+ args.unshift(this);
+ fn.apply(this, args);
+ };
+ /**
+ * Contains and collects metadata about a matching document.
+ * A single instance of lunr.MatchData is returned as part of every
+ * lunr.Index~Result.
+ *
+ * @constructor
+ * @param {string} term - The term this match data is associated with
+ * @param {string} field - The field in which the term was found
+ * @param {object} metadata - The metadata recorded about this term in this field
+ * @property {object} metadata - A cloned collection of metadata associated with this document.
+ * @see {@link lunr.Index~Result}
+ */
+ lunr.MatchData = function (term, field, metadata) {
+ var clonedMetadata = Object.create(null),
+ metadataKeys = Object.keys(metadata || {});
+
+ // Cloning the metadata to prevent the original
+ // being mutated during match data combination.
+ // Metadata is kept in an array within the inverted
+ // index so cloning the data can be done with
+ // Array#slice
+ for (var i = 0; i < metadataKeys.length; i++) {
+ var key = metadataKeys[i];
+ clonedMetadata[key] = metadata[key].slice();
+ }
+
+ this.metadata = Object.create(null);
+
+ if (term !== undefined) {
+ this.metadata[term] = Object.create(null);
+ this.metadata[term][field] = clonedMetadata;
+ }
+ };
+
+ /**
+ * An instance of lunr.MatchData will be created for every term that matches a
+ * document. However only one instance is required in a lunr.Index~Result. This
+ * method combines metadata from another instance of lunr.MatchData with this
+ * objects metadata.
+ *
+ * @param {lunr.MatchData} otherMatchData - Another instance of match data to merge with this one.
+ * @see {@link lunr.Index~Result}
+ */
+ lunr.MatchData.prototype.combine = function (otherMatchData) {
+ var terms = Object.keys(otherMatchData.metadata);
+
+ for (var i = 0; i < terms.length; i++) {
+ var term = terms[i],
+ fields = Object.keys(otherMatchData.metadata[term]);
+
+ if (this.metadata[term] == undefined) {
+ this.metadata[term] = Object.create(null);
+ }
+
+ for (var j = 0; j < fields.length; j++) {
+ var field = fields[j],
+ keys = Object.keys(otherMatchData.metadata[term][field]);
+
+ if (this.metadata[term][field] == undefined) {
+ this.metadata[term][field] = Object.create(null);
+ }
+
+ for (var k = 0; k < keys.length; k++) {
+ var key = keys[k];
+
+ if (this.metadata[term][field][key] == undefined) {
+ this.metadata[term][field][key] =
+ otherMatchData.metadata[term][field][key];
+ } else {
+ this.metadata[term][field][key] = this.metadata[term][field][
+ key
+ ].concat(otherMatchData.metadata[term][field][key]);
+ }
+ }
+ }
+ }
+ };
+
+ /**
+ * Add metadata for a term/field pair to this instance of match data.
+ *
+ * @param {string} term - The term this match data is associated with
+ * @param {string} field - The field in which the term was found
+ * @param {object} metadata - The metadata recorded about this term in this field
+ */
+ lunr.MatchData.prototype.add = function (term, field, metadata) {
+ if (!(term in this.metadata)) {
+ this.metadata[term] = Object.create(null);
+ this.metadata[term][field] = metadata;
+ return;
+ }
+
+ if (!(field in this.metadata[term])) {
+ this.metadata[term][field] = metadata;
+ return;
+ }
+
+ var metadataKeys = Object.keys(metadata);
+
+ for (var i = 0; i < metadataKeys.length; i++) {
+ var key = metadataKeys[i];
+
+ if (key in this.metadata[term][field]) {
+ this.metadata[term][field][key] = this.metadata[term][field][
+ key
+ ].concat(metadata[key]);
+ } else {
+ this.metadata[term][field][key] = metadata[key];
+ }
+ }
+ };
+ /**
+ * A lunr.Query provides a programmatic way of defining queries to be performed
+ * against a {@link lunr.Index}.
+ *
+ * Prefer constructing a lunr.Query using the {@link lunr.Index#query} method
+ * so the query object is pre-initialized with the right index fields.
+ *
+ * A new query starts with no clauses. The allFields array is kept by reference
+ * and is used by {@link lunr.Query#clause} as the default field scope of a
+ * clause that does not specify its own fields.
+ *
+ * @constructor
+ * @param {string[]} allFields - An array of all available fields in a lunr.Index.
+ * @property {lunr.Query~Clause[]} clauses - An array of query clauses.
+ * @property {string[]} allFields - An array of all available fields in a lunr.Index.
+ */
+ lunr.Query = function (allFields) {
+ this.clauses = [];
+ this.allFields = allFields;
+ };
+
+ /**
+ * Constants for indicating what kind of automatic wildcard insertion will be used when constructing a query clause.
+ *
+ * This allows wildcards to be added to the beginning and end of a term without having to manually do any string
+ * concatenation.
+ *
+ * The wildcard constants can be bitwise combined to select both leading and trailing wildcards.
+ *
+ * lunr.Query.wildcard is deliberately a String object wrapping "*" rather than a primitive string: the object
+ * form is what allows the NONE, LEADING and TRAILING constants to be attached to it as properties. Because it is
+ * an object, comparing it with a character only works with loose (in)equality, which is what lunr.Query#clause
+ * uses; a strict comparison against a primitive string would never be equal.
+ *
+ * @constant
+ * @default
+ * @property {number} wildcard.NONE - The term will have no wildcards inserted, this is the default behaviour
+ * @property {number} wildcard.LEADING - Prepend the term with a wildcard, unless a leading wildcard already exists
+ * @property {number} wildcard.TRAILING - Append a wildcard to the term, unless a trailing wildcard already exists
+ * @see lunr.Query~Clause
+ * @see lunr.Query#clause
+ * @see lunr.Query#term
+ * @example
+ * query.term('foo', {
+ * wildcard: lunr.Query.wildcard.LEADING | lunr.Query.wildcard.TRAILING
+ * })
+ */
+
+ lunr.Query.wildcard = new String("*");
+ lunr.Query.wildcard.NONE = 0;
+ lunr.Query.wildcard.LEADING = 1;
+ lunr.Query.wildcard.TRAILING = 2;
+
+ /**
+ * Constants for indicating what kind of presence a term must have in matching documents.
+ *
+ * @constant
+ * @enum {number}
+ * @see lunr.Query~Clause
+ * @see lunr.Query#clause
+ * @see lunr.Query#term
+ * @example <caption>query term with required presence</caption>
+ * query.term('foo', { presence: lunr.Query.presence.REQUIRED })
+ */
+ lunr.Query.presence = {
+ /**
+ * Term's presence in a document is optional, this is the default value.
+ */
+ OPTIONAL: 1,
+
+ /**
+ * Term's presence in a document is required, documents that do not contain
+ * this term will not be returned.
+ */
+ REQUIRED: 2,
+
+ /**
+ * Term's presence in a document is prohibited, documents that do contain
+ * this term will not be returned.
+ */
+ PROHIBITED: 3,
+ };
+
+ /**
+ * A single clause in a {@link lunr.Query} contains a term and details on how to
+ * match that term against a {@link lunr.Index}.
+ *
+ * @typedef {Object} lunr.Query~Clause
+ * @property {string[]} fields - The fields in an index this clause should be matched against.
+ * @property {number} [boost=1] - Any boost that should be applied when matching this clause.
+ * @property {number} [editDistance] - Whether the term should have fuzzy matching applied, and how fuzzy the match should be.
+ * @property {boolean} [usePipeline] - Whether the term should be passed through the search pipeline.
+ * @property {number} [wildcard=lunr.Query.wildcard.NONE] - Whether the term should have wildcards appended or prepended.
+ * @property {number} [presence=lunr.Query.presence.OPTIONAL] - The terms presence in any matching documents.
+ */
+
+ /**
+ * Adds a {@link lunr.Query~Clause} to this query.
+ *
+ * Unless the clause contains the fields to be matched all fields will be matched. In addition
+ * a default boost of 1 is applied to the clause.
+ *
+ * @param {lunr.Query~Clause} clause - The clause to add to this query.
+ * @see lunr.Query~Clause
+ * @returns {lunr.Query}
+ */
+ lunr.Query.prototype.clause = function (clause) { // fills in defaults on the passed clause object itself (mutated, not copied), then appends it to this.clauses
+ if (!("fields" in clause)) { // `in` tests key presence only, so an explicitly supplied value (even undefined) is left untouched
+ clause.fields = this.allFields;
+ }
+
+ if (!("boost" in clause)) {
+ clause.boost = 1;
+ }
+
+ if (!("usePipeline" in clause)) {
+ clause.usePipeline = true;
+ }
+
+ if (!("wildcard" in clause)) {
+ clause.wildcard = lunr.Query.wildcard.NONE;
+ }
+
+ if (
+ clause.wildcard & lunr.Query.wildcard.LEADING && // LEADING bit is set in the wildcard flags...
+ clause.term.charAt(0) != lunr.Query.wildcard // ...and the term does not already start with "*"; loose != is deliberate: lunr.Query.wildcard is a String object and only equals "*" after coercion
+ ) {
+ clause.term = "*" + clause.term;
+ }
+
+ if (
+ clause.wildcard & lunr.Query.wildcard.TRAILING && // TRAILING bit is set in the wildcard flags...
+ clause.term.slice(-1) != lunr.Query.wildcard // ...and the term does not already end with "*" (same loose comparison as above)
+ ) {
+ clause.term = "" + clause.term + "*";
+ }
+
+ if (!("presence" in clause)) {
+ clause.presence = lunr.Query.presence.OPTIONAL;
+ }
+
+ this.clauses.push(clause);
+
+ return this; // returns the query itself so calls can be chained
+ };
+
+ /**
+ * A negated query is one in which every clause has a presence of
+ * prohibited. These queries require some special processing to return
+ * the expected results.
+ *
+ * @returns {boolean} true when no clause has a presence other than PROHIBITED (this includes a query with no clauses at all).
+ */
+ lunr.Query.prototype.isNegated = function () {
+ for (var i = 0; i < this.clauses.length; i++) {
+ if (this.clauses[i].presence != lunr.Query.presence.PROHIBITED) { // first clause that is not prohibited settles the answer
+ return false;
+ }
+ }
+
+ return true; // loop found no non-prohibited clause (or never ran because there are no clauses)
+ };
+
+ /**
+ * Adds a term to the current query, under the covers this will create a {@link lunr.Query~Clause}
+ * to the list of clauses that make up this query.
+ *
+ * The term is used as is, i.e. no tokenization will be performed by this method. Instead conversion
+ * to a token or token-like string should be done before calling this method.
+ *
+ * The term will be converted to a string by calling `toString`. Multiple terms can be passed as an
+ * array, each term in the array will share the same options.
+ *
+ * @param {object|object[]} term - The term(s) to add to the query.
+ * @param {object} [options] - Any additional properties to add to the query clause.
+ * @returns {lunr.Query}
+ * @see lunr.Query#clause
+ * @see lunr.Query~Clause
+ * @example <caption>adding a single term to a query</caption>
+ * query.term("foo")
+ * @example <caption>adding a single term to a query and specifying search fields, term boost and automatic trailing wildcard</caption>
The Register; Raise some horns: Red Hat’s Metal³ aims to make Kubernetes on bare machines simple
+
+
+
+
Max Smolaks talks in
+this article about the OpenInfra Days in the UK, 2019: where Metal³ was
+revealed earlier last week by Steve Hardy, Red Hat’s senior principal
+software engineer. The Open Infrastructure Days in the UK is an event
+organized by the local Open Infrastructure community and supported by
+the OpenStack Foundation. The Open-source software developers at Red Hat
+are working on a tool that would simplify the deployment and management
+of Kubernetes clusters on bare-metal servers.
+
+
Steve told The Register:
+
+
+
“In some situations, you won’t want to run a full OpenStack
+infrastructure-as-a-service layer to provide, potentially, for
+multiple Kubernetes clusters”.
+
+
+
Hardy is a notable contributor to OpenStack, having previously worked on
+Heat and TripleO projects. He said one of the reasons for choosing
+Ironic was its active development – and when new features get added to
+Ironic, the Metal³ team gets them “for free”.
+
+
+
“OpenStack has always been a modular set of projects, and people have
+always had the opportunity to reuse components for different
+applications. This is just an example of where we are leveraging one
+particular component for infrastructure management, just as an
+alternative to using a full infrastructure API,” Hardy said.
+
+
+
Thierry Carrez, veep of engineering at the OpenStack Foundation also told
+The Register:
+
+
+
“I like the fact that the projects end up being reusable on their own,
+for the functions they bring to the table – this helps us integrate
+with adjacent communities”.
+
+
+
Hardy also commented:
+
+
+
It’s still early days for Metal³ - the project has just six
+contributors, and there’s no telling when it might reach release.
+“It’s a very, very young project but we are keen to get more community
+participation and feedback,”.
There are a number of great open-source tools for bare metal host provisioning, including Ironic. Metal³ aims to build on these technologies to provide a Kubernetes native API for managing bare metal hosts via a provisioning stack that is also running on Kubernetes. We believe that Kubernetes Native Infrastructure, or managing your infrastructure just like your applications, is a powerful next step in the evolution of infrastructure management.
+
+
The Metal³ project is also building integration with the Kubernetes cluster-api project, allowing Metal³ to be used as an infrastructure backend for Machine objects from the Cluster API.
+
+
Metal3 Repository Overview
+
+
There is a Metal³ overview and some more detailed design documents in the metal3-docs repository.
+
+
The baremetal-operator is the component that manages bare metal hosts. It exposes a new BareMetalHost custom resource in the Kubernetes API that lets you manage hosts in a declarative way.
+
+
Finally, the cluster-api-provider-baremetal repository includes integration with the cluster-api project. This provider currently includes a Machine actuator that acts as a client of the BareMetalHost custom resources.
+
+
Demo
+
+
The project has been going on for a few months now, and there’s enough now to show some working code.
+
+
For this demonstration, I’ve started with a 3-node Kubernetes cluster installed using OpenShift.
+
+
$ kubectl get nodes
+NAME STATUS ROLES AGE VERSION
+master-0 Ready master 24h v1.13.4+d4ce02c1d
+master-1 Ready master 24h v1.13.4+d4ce02c1d
+master-2 Ready master 24h v1.13.4+d4ce02c1d
+
+
+
Machine objects were created to reflect these 3 masters, as well.
+
+
$ kubectl get machines
+NAME INSTANCE STATE TYPE REGION ZONE AGE
+ostest-master-0 24h
+ostest-master-1 24h
+ostest-master-2 24h
+
+
+
For this cluster-api provider, a Machine has a corresponding BareMetalHost object, which corresponds to the piece of hardware we are managing. There is a design document that covers the relationship between Nodes, Machines, and BareMetalHosts.
+
+
Since these hosts were provisioned earlier, they are in a special externally provisioned state, indicating that we enrolled them in management while they were already running in a desired state. If changes are needed going forward, the baremetal-operator will be able to automate them.
+
+
$ kubectl get baremetalhosts
+NAME STATUS PROVISIONING STATUS MACHINE BMC HARDWARE PROFILE ONLINE ERROR
+openshift-master-0 OK externally provisioned ostest-master-0 ipmi://192.168.111.1:6230 true
+openshift-master-1 OK externally provisioned ostest-master-1 ipmi://192.168.111.1:6231 true
+openshift-master-2 OK externally provisioned ostest-master-2 ipmi://192.168.111.1:6232 true
+
+
+
Now suppose we’d like to expand this cluster by adding another bare metal host to serve as a worker node. First, we need to create a new BareMetalHost object that adds this new host to the inventory of hosts managed by the baremetal-operator. Here’s the YAML for the new BareMetalHost:
Now to add the BareMetalHost and its IPMI credentials Secret to the cluster:
+
+
$ kubectl create -f worker_crs.yaml
+secret/openshift-worker-0-bmc-secret created
+baremetalhost.metalkube.org/openshift-worker-0 created
+
+
+
The list of BareMetalHosts now reflects a new host in the inventory that is ready to be provisioned. It will remain in this ready state until it is claimed by a new Machine object.
+
+
$ kubectl get baremetalhosts
+NAME STATUS PROVISIONING STATUS MACHINE BMC HARDWARE PROFILE ONLINE ERROR
+openshift-master-0 OK externally provisioned ostest-master-0 ipmi://192.168.111.1:6230 true
+openshift-master-1 OK externally provisioned ostest-master-1 ipmi://192.168.111.1:6231 true
+openshift-master-2 OK externally provisioned ostest-master-2 ipmi://192.168.111.1:6232 true
+openshift-worker-0 OK ready ipmi://192.168.111.1:6233 unknown true
+
+
+
We have a MachineSet already created for workers, but it scaled down to 0.
+
+
$ kubectl get machinesets
+NAME DESIRED CURRENT READY AVAILABLE AGE
+ostest-worker-0 0 0 24h
+
+
+
We can scale this MachineSet to 1 to indicate that we’d like a worker provisioned. The baremetal cluster-api provider will then look for an available BareMetalHost, claim it, and trigger provisioning of that host.
After the new Machine was created, our cluster-api provider claimed the available host and triggered it to be provisioned.
+
+
$ kubectl get baremetalhosts
+NAME STATUS PROVISIONING STATUS MACHINE BMC HARDWARE PROFILE ONLINE ERROR
+openshift-master-0 OK externally provisioned ostest-master-0 ipmi://192.168.111.1:6230 true
+openshift-master-1 OK externally provisioned ostest-master-1 ipmi://192.168.111.1:6231 true
+openshift-master-2 OK externally provisioned ostest-master-2 ipmi://192.168.111.1:6232 true
+openshift-worker-0 OK provisioning ostest-worker-0-jmhtc ipmi://192.168.111.1:6233 unknown true
+
+
+
This process takes some time. Under the hood, the baremetal-operator is driving Ironic through a provisioning process. This begins with wiping disks to ensure the host comes up in a clean state. It will eventually write the desired OS image to disk and then reboot into that OS. When complete, a new Kubernetes Node will register with the cluster.
+
+
$ kubectl get baremetalhosts
+NAME STATUS PROVISIONING STATUS MACHINE BMC HARDWARE PROFILE ONLINE ERROR
+openshift-master-0 OK externally provisioned ostest-master-0 ipmi://192.168.111.1:6230 true
+openshift-master-1 OK externally provisioned ostest-master-1 ipmi://192.168.111.1:6231 true
+openshift-master-2 OK externally provisioned ostest-master-2 ipmi://192.168.111.1:6232 true
+openshift-worker-0 OK provisioned ostest-worker-0-jmhtc ipmi://192.168.111.1:6233 unknown true
+
+
+$ kubectl get nodes
+NAME STATUS ROLES AGE VERSION
+master-0 Ready master 24h v1.13.4+d4ce02c1d
+master-1 Ready master 24h v1.13.4+d4ce02c1d
+master-2 Ready master 24h v1.13.4+d4ce02c1d
+worker-0 Ready worker 68s v1.13.4+d4ce02c1d
+
+
+
The following screen cast demonstrates this process, as well:
+
+
+
+
Removing a bare metal host from the cluster is very similar. We just have to scale this MachineSet back down to 0.
Once the Machine has been deleted, the baremetal-operator will deprovision the bare metal host.
+
+
$ kubectl get baremetalhosts
+NAME STATUS PROVISIONING STATUS MACHINE BMC HARDWARE PROFILE ONLINE ERROR
+openshift-master-0 OK externally provisioned ostest-master-0 ipmi://192.168.111.1:6230 true
+openshift-master-1 OK externally provisioned ostest-master-1 ipmi://192.168.111.1:6231 true
+openshift-master-2 OK externally provisioned ostest-master-2 ipmi://192.168.111.1:6232 true
+openshift-worker-0 OK deprovisioning ipmi://192.168.111.1:6233 unknown false
+
+
+
Once the deprovisioning process is complete, the bare metal host will be back to its ready state, available in the host inventory to be claimed by a future Machine object.
+
+
$ kubectl get baremetalhosts
+NAME STATUS PROVISIONING STATUS MACHINE BMC HARDWARE PROFILE ONLINE ERROR
+openshift-master-0 OK externally provisioned ostest-master-0 ipmi://192.168.111.1:6230 true
+openshift-master-1 OK externally provisioned ostest-master-1 ipmi://192.168.111.1:6231 true
+openshift-master-2 OK externally provisioned ostest-master-2 ipmi://192.168.111.1:6232 true
+openshift-worker-0 OK ready ipmi://192.168.111.1:6233 unknown false
+
The new stack Metal³ Uses OpenStack’s Ironic for Declarative Bare Metal Kubernetes
+
+
Mike Melanson talks in this article about the Open Infrastructure Summit in Denver, Colorado, where bare metal was one of the main themes of the event.
+
+
During this event, the OpenStack Foundation unveiled a new project called Metal³ (pronounced “metal cubed”) that uses Ironic “as a foundation for declarative management of bare metal infrastructure for Kubernetes”.
+He also comments on how James Penick, Chris Hoge, senior strategic program manager at OpenStack Foundation,
+and Julia Kreger, OpenStack Ironic Project Team Leader, took to the stage to offer a demonstration of Metal3,
+the new project that provides “bare metal host provisioning integration for Kubernetes.”
+
+
Some words from Kreger in an interview with The New Stack:
+
+
+
“I think the bigger trend that we’re starting to see is a recognition that common tooling and substrate helps everyone succeed faster with more efficiency.”
+
+
“This is combined with a shift in the way operators are choosing to solve their problems at scale, specifically in regards to isolation, cost, or performance.”
+
+
+
For further detail, check out the video of the keynote, which includes a demonstration of Metal3 being used to quickly provision three bare metal servers with Kubernetes
+or check the full article included below.
In this blog post, I’m going to try to explain in my own words a high level
+overview of what Metal3 is, the motivation behind it and some concepts related
+to a ‘baremetal operator’.
+
+
Let’s have some definitions!
+
+
Custom Resource Definition
+
+
The k8s API provides out-of-the-box objects such as pods, services, etc.
+There are a few methods of extending the k8s API (such as API extensions)
+but since a few releases back, the k8s API can be extended easily with custom resources definitions (CRDs).
+Basically, this means you can virtually create any type of object definition in k8s
+(actually only users with cluster-admin capabilities) with a yaml such as:
+
+
apiVersion:apiextensions.k8s.io/v1beta1
+kind:CustomResourceDefinition
+metadata:
+ # name must match the spec fields below, and be in the form: <plural>.<group>
+ name:crontabs.stable.example.com
+spec:
+ # group name to use for REST API: /apis/<group>/<version>
+ group:stable.example.com
+ # list of versions supported by this CustomResourceDefinition
+ versions:
+ -name:v1
+ # Each version can be enabled/disabled by Served flag.
+ served:true
+ # One and only one version must be marked as the storage version.
+ storage:true
+ # either Namespaced or Cluster
+ scope:Namespaced
+ names:
+ # plural name to be used in the URL: /apis/<group>/<version>/<plural>
+ plural:crontabs
+ # singular name to be used as an alias on the CLI and for display
+ singular:crontab
+ # kind is normally the CamelCased singular type. Your resource manifests use this.
+ kind:CronTab
+ # shortNames allow shorter string to match your resource on the CLI
+ shortNames:
+ -ct
+ preserveUnknownFields:false
+ validation:
+ openAPIV3Schema:
+ type:object
+ properties:
+ spec:
+ type:object
+ properties:
+ cronSpec:
+ type:string
+ image:
+ type:string
+ replicas:
+ type:integer
+
+
+
And after kubectl apply -f you can kubectl get crontabs.
The CRD by itself is not useful, as nobody will take care of it (that’s why I said definition). It
+requires a controller to watch for those new objects and react to different
+events affecting the object.
+
+
Controller
+
+
A controller is basically a loop that watches the current status of an object
+and if it is different from the desired status, it fixes it (reconciliation).
+This is why k8s is ‘declarative’: you specify the object’s desired status instead of
+‘how to do it’ (imperative).
+
+
Again, there are tons of documentation (and examples) around the controller pattern which is
+basically the k8s roots, so I’ll let your google-fu take care of it :)
+
+
Operator
+
+
An Operator (in k8s slang) is an application running in your k8s
+cluster that deploys, manages and maintains (so, operates) a k8s application.
+
+
This k8s application (the one that the operator manages), can be as simple as a ‘hello world’ application
+containerized and deployed in your k8s cluster or it can be a much more complex
+thing, such as a database cluster.
+
+
The ‘operator’ is like an ‘expert sysadmin’ containerized that takes care of
+your application.
+
+
Bear in mind that the ‘expert’ tag (meaning the automation behind the operator)
+depends on the operator implementation… so there can be basic operators that
+only deploy your application or complex operators that handle day 2 operations
+such as upgrades, failovers, backup/restore, etc.
k8s code is smart enough to be able to leverage
+the underlying infrastructure where the cluster is running, such as being able
+to create ‘LoadBalancer’ services, understand the cluster topology based on the cloud provider AZs where the nodes are running (for scheduling reasons), etc.
+
+
This task of ‘talking to the cloud provider’ is performed by the Cloud Controller Manager (CCM) and for more
+information, you can take a look at the official k8s documentation with
+regard to the architecture and the administration (also, if you are brave enough, you can create your own cloud controller manager)
+
+
Cluster API
+
+
The Cluster API implementation is a WIP ‘framework’ that allows a k8s cluster to manage itself, including the ability to create new clusters, add more nodes, etc. in a ‘k8s way’ (declarative, controllers, CRDs, etc.), so there are objects such as Cluster that can be expressed as k8s objects:
There are some
+provider implementations in the wild such as AWS, Azure, GCP, OpenStack,
+vSphere, etc. ones and the Cluster API project is driven by the SIG Cluster Lifecycle.
+
+
Please review the official Cluster API repository for more information.
+
+
Actuator
+
+
The actuator is a Cluster API interface that reacts to changes to Machine
+objects, reconciling the Machine status.
+
+
The actuator code is tightly coupled with the provider (that’s why it is an
+interface) such as the AWS one.
+
+
MachineSet vs Machine
+
+
To simplify, let’s say that MachineSets are to Machines what ReplicaSets are
+to Pods. So you can scale the Machines in your cluster just by changing
+the number of replicas of a MachineSet.
+
+
Cluster API vs Cloud Providers
+
+
As we have seen, the Cluster API leverages the provider related to the k8s
+infrastructure itself (clusters and nodes) and the CCM and the cloud provider
+integration for k8s is to leverage the cloud provider to provide support infrastructure.
+
+
Let’s say Cluster API is for the k8s administrators and the
+CCM is for the k8s users :)
+
+
Machine API
+
+
The OpenShift 4 Machine API is a combination of some of the upstream Cluster API
+with custom OpenShift resources and it is designed to work in conjunction with
+the Cluster Version Operator.
+
+
OpenShift’s Machine API Operator
+
+
The machine-api-operator is
+an operator that manages the Machine API objects in an OpenShift 4 cluster.
A baremetal server (or bare-metal) is just a computer server.
+
+
In recent years, terms such as virtualization, containers, serverless, etc. have been
+popular but at the end of the day, all the code running on top of a SaaS, PaaS
+or IaaS is actually running in a real physical server stored in a datacenter
+wired to routers, switches and power. That server is a ‘baremetal’ server.
+
+
If you are used to cloud providers and instances, you probably don’t know the
+pains of baremetal management… including things such as connecting to the
+virtual console (usually it requires an old Java version) to debug issues,
+configuring pxe for provisioning baremetal hosts (or attaching ISOs via the virtual console… or physically inserting a CD/DVD into the CD tray if you are
+‘lucky’ enough…), configuring VLANs for traffic isolation, etc.
+
+
That kind of operation is not ‘cloud’ ready and there are tools that provide
+baremetal management, such as maas or ironic.
+
+
Ironic
+
+
OpenStack bare metal provisioning (or ironic) is an open source project (or even better, a number of open source projects) to manage baremetal hosts. Ironic avoids the administrator dealing with pxe configuration, manual deployments, etc. and provides a defined API and a series of plugins to interact with different baremetal models and vendors.
+
+
Ironic is used in OpenStack to provide baremetal objects but there are some
+projects (such as bifrost) to use
+Ironic ‘standalone’ (so, no OpenStack required)
+
+
Metal3
+
+
Metal3 is a project aimed at providing a baremetal operator that
+implements the Cluster API framework required to be able to manage baremetal
+in a k8s way (easy peasy!). It uses ironic under the hood to avoid reinventing the
+wheel, but consider it as an implementation detail that may change.
+
+
The Metal3 baremetal operator watches for BareMetalHost (CRD) objects defined as:
There are a few more fields in the BareMetalHost object such as the image, hardware profile, etc.
+
+
The Metal3 project is actually divided into two different components:
+
+
baremetal-operator
+
+
The Metal3 baremetal-operator is the component that manages baremetal hosts. It exposes a new BareMetalHost custom resource in the k8s API that lets you manage hosts in a declarative way.
+
+
cluster-api-provider-baremetal
+
+
The Metal3 cluster-api-provider-baremetal includes the integration with the Cluster API project. This provider currently includes a Machine actuator that acts as a client of the BareMetalHost custom resources.
+
+
BareMetalHost vs Machine vs Node
+
+
+
BareMetalHost is a Metal3 object
+
Machine is a Cluster API object
+
Node is where the pods run :)
+
+
+
Those three concepts are linked in a 1:1:1 relationship meaning:
+
+
A BareMetalHost created with Metal3 maps to a Machine object and once the
+installation procedure finishes, a new kubernetes node will be added to the
+cluster.
+
+
$ kubectl get nodes
+NAME STATUS ROLES AGE VERSION
+my-node-0.example.com Ready master 25h v1.14.0
+
+
+$ kubectl get machines --all-namespaces
+NAMESPACE NAME INSTANCE STATE TYPE REGION ZONE AGE
+openshift-machine-api my-node-0 25h
+
+
+$ kubectl get baremetalhosts --all-namespaces
+NAMESPACE NAME STATUS PROVISIONING STATUS MACHINE BMC HARDWARE PROFILE ONLINE ERROR
+openshift-machine-api my-node-0 OK provisioned my-node-0.example.com ipmi://1.2.3.4 unknown true
+
+
+
The 1:1 relationship for the BareMetalHost and the Machine is stored in the
+machineRef field in the BareMetalHost object:
The baremetal operator, documented at https://github.com/metal3-io/baremetal-operator/blob/master/docs/api.md, is the Operator in charge of the definitions of physical hosts, containing information about how to reach the Out of Band management controller, the URL of the desired image to provision, plus other properties related to hosts being used for provisioning instances.
+
+
Quoting from the project:
+
+
+
The Bare Metal Operator implements a Kubernetes API for managing bare metal hosts. It maintains an inventory of available hosts as instances of the BareMetalHost Custom Resource Definition. The Bare Metal Operator knows how to:
+Inspect the host’s hardware details and report them on the corresponding BareMetalHost. This includes information about CPUs, RAM, disks, NICs, and more.
+Provision hosts with a desired image
+Clean a host’s disk contents before or after provisioning.
+
+
+
A more in-depth approach
+
+
The Baremetal Operator (BMO) keeps a mapping of each host and its management interfaces (vendor-based like iLO, iDrac, iRMC, etc) and is controlled via IPMI.
[root@metal3-kubernetes ~]#kubectl create -f metal3-node01-machine.yml
+secret/metal3-node01-user-data created
+machine.cluster.k8s.io/metal3-node01 created
+
+
+
Let’s examine the annotation created when provisioning (metal3.io/BareMetalHost):
In the output above, the host assigned was the one we’ve defined earlier as well as the other parameters like IP’s, etc generated.
+
+
+
+
Now, if we check baremetal hosts, we can see how it’s getting provisioned:
+
+
[root@metal3-kubernetes ~]#kubectl get baremetalhost -n metal3
+NAME STATUS PROVISIONING STATUS CONSUMER BMC HARDWARE PROFILE ONLINE ERROR
+metal3-node01 OK provisioned ipmi://172.22.0.1:6230 true
+
+
+
And also, check it via the ironic command:
+
+
[root@metal3-kubernetes ~]#export OS_TOKEN=fake-token ;export OS_URL=http://localhost:6385 ; openstack baremetal node list
++--------------------------------------+---------------+--------------------------------------+-------------+--------------------+-------------+
+| UUID | Name | Instance UUID | Power State | Provisioning State | Maintenance |
++--------------------------------------+---------------+--------------------------------------+-------------+--------------------+-------------+
+| 7551cfb4-d758-4ad8-9188-859ee53cf298 | metal3-node01 | 7551cfb4-d758-4ad8-9188-859ee53cf298 | power on | active | False |
++--------------------------------------+---------------+--------------------------------------+-------------+--------------------+-------------+
+
+
+
Wrap-up
+
+
We’ve seen how via a CRD we’ve defined credentials for a baremetal host to make it available to get provisioned and how we’ve also defined a machine that was provisioned on top of that baremetal host.
+
+
+
+
Ironic was chosen as the initial provider for baremetal provisioning, check Ironic documentation for more details about Ironic usage in Metal³ ↩
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/blog/2019/10/31/OpenStack-Ironic-and-Bare-Metal-Infrastructure_All-Abstractions-Start-Somewhere.html b/blog/2019/10/31/OpenStack-Ironic-and-Bare-Metal-Infrastructure_All-Abstractions-Start-Somewhere.html
new file mode 100644
index 000000000..e149b105d
--- /dev/null
+++ b/blog/2019/10/31/OpenStack-Ironic-and-Bare-Metal-Infrastructure_All-Abstractions-Start-Somewhere.html
@@ -0,0 +1,621 @@
+
+
+
+
+
+
+
+
+
+
+
+ OpenStack Ironic and Bare Metal Infrastructure: All Abstractions Start Somewhere - Chris Hoge, OpenStack Foundation; Julia Kreger, Red Hat | Metal³ - Metal Kubed
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Conference talk: OpenStack Ironic and Bare Metal Infrastructure: All Abstractions Start Somewhere
+
+
The history of cloud computing has rapidly layered abstractions on abstractions to deliver applications faster, more reliably, and easier. Serverless functions on top of containers on top of virtualization. However, at the bottom of every stack is physical hardware that has an entire lifecycle that needs to be managed.
+
+
In this video, Chris and Julia show how OpenStack Ironic is a solution to the problem of managing bare-metal infrastructure.
+
+
+
+
+
+
+
+
Speakers
+
+
Chris Hoge is a Senior Strategic Program Manager for the OpenStack foundation. He’s been an active contributor to the Interop Working Group (formerly DefCore) and helps run the trademark program for the OpenStack Foundation. He also works on collaborations between the OpenStack and Kubernetes communities. Previously he worked as an OpenStack community manager and developer at Puppet Labs and operated a research cloud for the College of Arts and Sciences at The University of Oregon. When not cloud computing, he enjoys long-distance running, dancing, and throwing a ball for his Border Collie.
+
+
Julia Kreger is a Principal Software Engineer at Red Hat. She started her career in networking and eventually shifted to systems engineering. The DevOps movement led her into software development and the operationalization of software due to the need to automate large-scale systems deployments. She is experienced in conveying an operational perspective while bridging that with requirements and doesn’t mind getting deep down into code to solve a problem.
+She is an active core contributor and leader in the OpenStack Ironic project, which is a project she feels passionate about due to many misspent hours in data centres deploying hardware. Prior to OpenStack, Julia contributed to the Shared Learning Infrastructure and worked with large-scale litigation database systems.
Conference talk: Open Infrastructure Days UK 2019; Kubernetes-native Infrastructure: Managed Baremetal with Kubernetes Operators and OpenStack Ironic - Steve Hardy, Red Hat
+
+
In this session, you can hear about a new effort to enable baremetal Kubernetes deployments using native interfaces, and in particular, the Kubernetes Operator framework, combined with OpenStack Ironic.
+
+
This approach aims to seamlessly integrate your infrastructure with your workloads, including baremetal servers, storage and container/VM workloads. All this can be achieved using kubernetes native applications, combined with existing, proven deployment and storage tooling.
+
+
In this talk, we cover the options around Kubernetes deployments today, the specific approach taken by the new Kubernetes-native “MetalKube” project, and the status/roadmap of this new community effort.
+
+
Speakers
+
+
Steve Hardy is a Senior Principal Software Engineer at Red Hat, currently involved in kubernetes/OpenShift deployment and architecture. He is also an active member of the OpenStack community and has been a project team leader of both the Heat (orchestration) and TripleO (deployment) projects.
Conference talk: Extend Your Data Center to the Hybrid Edge - Red Hat Summit, May 2019, Paul Cormier, Burr Sutter and Garima Sharma
+
+
A critical part of being successful in the hybrid cloud is being successful in your data centre with your own infrastructure.
+
+
In this video, Paul Cormier, Burr Sutter and Garima Sharma show how you can bring the Open Hybrid cloud to the edge. Cluster management from multiple cloud providers to on-premise. In the demo you’ll see a multi-cluster inventory for the open hybrid cloud at cloud.redhat.com, OpenShift Container Storage providing storage for Virtual Machines and containers (Cloud, Virtualization and bare metal), and everything Kubernetes native.
+
+
Speakers
+
+
Paul Cormier Executive vice president and president, Products and Technologies. Leads Red Hat’s technology and products organizations, including engineering, product management, and product marketing for Red Hat’s technologies. He joined Red Hat in May 2001 as executive vice president, Engineering.
+
+
Burr Sutter A lifelong developer advocate, community organizer, and technology evangelist, Burr Sutter is a featured speaker at technology events around the globe —from Bangalore to Brussels and Berlin to Beijing (and most parts in between)— he is currently Director of Developer Experience at Red Hat. A Java Champion since 2005 and former president of the Atlanta Java User Group, Burr founded the DevNexus conference —now the second largest Java event in the U.S.— with the aim of making access to the world’s leading developers affordable to the developer community.
+
+
+
Garima Sharma Senior Engineering leader at the world’s largest Open Source company. As a seasoned Tech professional, she runs a global team of Solutions Engineers focused on a large portfolio of Cloud Computing products and technology. She has helped shape science and technology for mission-critical software, reliability in operations and re-design of architecture all geared towards advancements in medicine, security, cloud technologies and bottom-line savings for the client businesses. Whether leading the architecture, development and delivery of customer-centric cutting-edge systems or spearheading diversity and inclusion initiatives via keynotes, blogs and conference presentations, Garima champions the idea of STEM. Garima ardently believes in Maya Angelou’s message that diversity makes for a rich tapestry, and we must understand that all the threads of the tapestry are equal in value no matter what their color.
Conference talk: Introducing Metal³: Kubernetes Native Bare Metal Host Management - Russell Bryant & Doug Hellmann, Red Hat
+
+
Metal³ (metal cubed/Kube) is a new open-source bare metal host provisioning tool created to enable Kubernetes-native infrastructure management. Metal³ enables the management of bare metal hosts via custom resources managed through the Kubernetes API as well as the monitoring of bare metal host metrics to Prometheus. This presentation will explain the motivations behind creating the project and what has been accomplished so far. This will be followed by an architectural overview and description of the Custom Resource Definitions (CRDs) for describing bare metal hosts, leading to a demonstration of using Metal³ in a Kubernetes cluster.
+
+
In this video, Russell Bryant and Doug Hellmann speak about the what’s and how’s of Metal³, a new tool that enables the management of bare metal hosts via custom resources managed through the Kubernetes API.
+
+
+
+
+
+
+
+
Speakers
+
+
Russell Bryant Russell Bryant is a Distinguished Engineer at Red Hat, where he works on infrastructure management to support Kubernetes clusters. Prior to working on the Metal³ project, Russell worked on other open infrastructure projects. Russell worked in Software Defined Networking with Open vSwitch (OVS) and Open Virtual Network (OVN) and worked on various parts of OpenStack. Russell also worked in open source telephony via the Asterisk project.
+
+
Doug Hellmann Doug Hellmann is a Senior Principal Software Engineer at Red Hat. He has been a professional developer since the mid-1990s and has worked on a variety of projects in fields such as mapping, medical news publishing, banking, data centre automation, and hardware provisioning. He has been contributing to open-source projects for most of his career and for the past 7 years he has been focusing on open-source cloud computing technologies, including OpenStack and Kubernetes.
Conference talk: Metal³: Deploy Kubernetes on Bare Metal - Yolanda Robla, Red Hat
+
+
Some of the most influential minds in the developer industry were landing in the gorgeous ancient city of Split, Croatia, to talk at the Shift Dev 2019 - Developer Conference about the most cutting-edge technologies, techniques and biggest trends in the developer space.
+
+
In this video, Yolanda Robla speaks about the deployment of Kubernetes on Bare Metal with the help of Metal³, a new tool that enables the management of bare metal hosts via custom resources managed through the Kubernetes API.
+
+
+
+
+
+
+
+
Speakers
+
+
Yolanda Robla Yolanda Robla is a Principal Software Engineer at Red Hat. In her own words:
+
+
+
In my current position in Red Hat as an NFV Partner Engineer, I investigate new technologies and create proofs of concept for partners to embrace new technologies. Being the current PTL of Akraino, I am involved in designing and implementing systems based on Kubernetes for the Edge use cases, ensuring high scalability and reproducibility using a GitOps approach.
The metal3-dev-env is a
+collection of scripts in a GitHub repository inside the
+Metal³ project that aims to
+allow contributors and other interested users to run a fully functional
+Metal³ environment for testing and have a first contact with the
+project. Actually, metal3-dev-env sets up an emulated environment
+which creates a set of virtual machines (VMs) to manage as if they were
+bare metal hosts.
+
+
Warning
This is not an installation that is supposed to be run in production.
+Instead, it is focused on providing a development environment to test
+and validate new features.
+
+
+
+
The metal3-dev-env repository includes a set of scripts, libraries and
+resources used to set up a Metal³ development environment. On the
+Metal³ website
+there is already a documented process on how to use the metal3-dev-env
+scripts to set up a fully functional cluster to test the functionality
+of the Metal³ components.
+
+
This procedure at a 10,000-foot view is composed of 3 bash scripts plus
+a verification one:
+
+
+
+
01_prepare_host.sh - Mainly installs all needed packages.
+
02_configure_host.sh - Basically create a set of VMs that will be
+managed as if they were bare metal hosts. It also downloads some
+images needed for Ironic.
+
03_launch_mgmt_cluster.sh - Launches a management cluster using
+minikube and runs the baremetal-operator on that cluster.
+
04_verify.sh - Finally runs a set of tests that verify that the
+deployment was completed successfully
+
+
+
In this blog post, we are going to expand the information and provide
+some hints and recommendations.
+
+
Warning
Metal³ project is changing rapidly, so probably this information is
+valuable in the short term. In any case, it is encouraged to
+double-check that the information provided is still valid.
+
+
+
+
Before getting down to it, it is worth defining the nomenclature used in the blog post:
+
+
+
Host. It is the server where the virtual environment is running.
+In this case, it is a physical PowerEdge M520 with 2 x Intel(R)
+Xeon(R) CPU E5-2450 v2 @ 2.50GHz, 96GB RAM and a 140GB drive running
+CentOS 7 latest. Do not panic, lab environment should work with lower
+resources as well.
+
Virtual bare metal hosts. These are the virtual machines (KVM
+based) that are running on the host which are emulating physical hosts
+in our lab. They are also called bare metal hosts even if they are not
+physical servers.
+
Management or bootstrap cluster. It is a fully functional
+Kubernetes cluster in charge of running all the necessary Metal³
+operators and controllers to manage the infrastructure. In this case
+it is the minikube virtual machine.
+
Target cluster. It is the Kubernetes cluster created from the
+management one. It is provisioned and configured using a native
+Kubernetes API for that purpose.
+
+
+
Create the Metal³ laboratory
+
+
Information
A non-root user must exist in the host with password-less sudo access.
+This user is in charge of running the metal3-dev-env scripts.
+
+
+
+
The first thing that needs to be done is, obviously, cloning the
+metal3-dev-env repository:
Before starting to deploy the Metal³ environment, it makes sense to
+detail a series of scripts inside the library folder that will be
+sourced in every step of the installation process. They are called
+shared libraries.
Although there are several scripts placed inside the lib folder that are
+sourced in some of the deployment steps, common.sh and logging.sh
+are the only ones used in all of the executions during the installation
+process.
+
+
common.sh
+
+
The first time this library is run, a new configuration file is created
+with several variables along with their default values. They will be
+used during the installation process. On the other hand, if the file
+already exists, then it just sources the values configured. The
+configuration file is created inside the cloned folder with
+config_$USER as the file name.
+
+
[alosadag@eko1 metal3-dev-env]$ ls config_*
+config_alosadag.sh
+
+
+
The configuration file contains multiple variables that will be used
+during the set-up. Some of them are detailed in the setup section of
+the Metal³ try-it web page.
+In case you need to add or change global variables it should be done in
+this config file.
+
+
Note
I personally recommend modifying or adding variables in this config
+file instead of exporting them in the shell. By doing that, it is
+assured that they are persisted
+
+
+
+
[alosadag@eko1 metal3-dev-env]$ cat ~/metal3-dev-env/config_alosadag.sh
+#!/bin/bash
+#
+# This is the subnet used on the "baremetal" libvirt network, created as the
+# primary network interface for the virtual bare metalhosts.
+#
+# Default of 192.168.111.0/24 set in lib/common.sh
+#
+#export EXTERNAL_SUBNET="192.168.111.0/24"
+#
+# This SSH key will be automatically injected into the provisioned host
+# by the provision_host.sh script.
+#
+# Default of ~/.ssh/id_rsa.pub is set in lib/common.sh
+#
+#export SSH_PUB_KEY=~/.ssh/id_rsa.pub
+...
+
+
+
This common.sh library also makes sure there is an ssh public key
+available in the user’s ssh folder. This key will be injected by
+cloud-init in all the virtual bare metal machines that will be
+configured later. Then, the user that executed the metal3-dev-env
+scripts is able to access the target cluster through ssh.
+
+
The common.sh library also sets more global variables apart from
+those in the config file. Note that these variables can be added to the
+config file along with the proper values for your environment.
It is important to mention that there are several basic functions
+defined in this file that will be used by the rest of scripts.
+
+
+
+
logging.sh
+
+
This script ensures that there is a log folder where all the information
+gathered during the execution of the scripts is stored. If there is any
+issue during the deployment, this is one of the first places to look at.
In this first step (01_prepare_host.sh), the requirements needed to
+start the preparation of the host where the virtual bare metal hosts
+will run are fulfilled. Depending on the host’s operating system (OS),
+it will trigger a specific script for CentOS/Red Hat or Ubuntu.
As stated previously, CentOS 7 is the operating system chosen to run
+in both, the host and virtual servers. Therefore, specific packages of
+the operating system are applied in the following script:
+
+
+
+
centos_install_requirements.sh
+This script enables epel and tripleo (current-tripleo)
+repositories where several packages are installed: dnf, ansible,
+wget, python3 and python related packages such as
+python-virtualbmc from tripleo repository.
+
+
+
Note
Notice that SELinux is set to permissive and an OS update is
+triggered, which will cause several packages to be upgraded since
+there are newer packages in the tripleo repositories (mostly python
+related) than in the rest of enabled repositories. At this point, the
+container runtime is also installed. Note that by setting the variable
+CONTAINER_RUNTIME defined in common.sh it is possible to
+choose between docker and podman, which is the default for CentOS.
+Remember that this behavior can be overwritten in your config file.
+
+
+
+
Once the specific requirements for the elected operating system are
+accomplished, the download of several external artifacts is executed.
+Actually minikube, kubectl and kustomize are downloaded from the
+internet. Notice that the version of Kustomize and Kubernetes is defined
+by KUSTOMIZE_VERSION and KUBERNETES_VERSION variables inside
+common.sh, but minikube is always downloading the latest
+version available.
+
+
The next step deals with cleaning ironic containers and pods that
+could be running in the host from failed deployments. This will ensure
+that there will be no issues when creating ironic-pod and infra-pod
+a little bit later in this first step.
+
+
+
+
network.sh.
+
+
+
At this point, the network library script is sourced. As expected,
+this library deals with the network configuration which includes: IP
+addresses, network definitions and IPv6 support which is disabled by
+default by setting PROVISIONING_IPV6 variable:
+
+
+
+
+
Name of the variable
+
Default value
+
Option
+
+
+
+
+
PROVISIONING_NETWORK
+
172.22.0.0/24
+
This is the subnet used to run the OS provisioning process
+
+
+
EXTERNAL_SUBNET
+
192.168.111.0/24
+
This is the subnet used on the “baremetal” libvirt network, created as the primary network interface for the virtual bare metal hosts
+
+
+
LIBVIRT_FIRMWARE
+
bios
+
+
+
+
PROVISIONING_IPV6
+
false
+
+
+
+
+
+
Below is depicted a network diagram of the different virtual
+networks and virtual servers involved in the Metal³ environment:
+
+
+
+
+
+
+
images.sh.
+
+
+
The images.sh library file is sourced as well in script
+01_prepare_host.sh. The images.sh script contains multiple
+variables that set the URL (IMAGE_LOCATION), name (IMAGE_NAME) and
+default username (IMAGE_USERNAME) of the cloud image that needs to
+be downloaded. The values of each variable will differ depending on
+the operating system of the virtual bare metal hosts. Note that these
+images will be served from the host to the virtual servers through the
+provisioning network.
+
+
In our case, since CentOS 7 is the base operating system, values
+will be defined as:
In case it is expected to use a custom cloud image, just modify the
+previous variables to match the right location.
+
+
+
+
Now that the cloud image is defined, the download process can be
+started. First, a folder defined by IRONIC_IMAGE_DIR should exist so
+that the image (CentOS-7-x86_64-GenericCloud-1907.qcow2) and its
+checksum can be stored. This folder and its content will be exposed
+through a local ironic container running in the host.
+
+
+
+
+
Name of the variable
+
Default value
+
+
+
IRONIC_IMAGE_DIR
+
/opt/metal3-dev-env/ironic/html/images
+
+
+
+
+
Below it is verified that the cloud image files were downloaded
+successfully in the defined folder:
Once the shared script images.sh is sourced, the following container
+images are pre-cached locally to the host in order to speed up things
+later. Below is shown the code snippet in charge of that task:
+
+
+ for IMAGE_VAR in IRONIC_IMAGE IPA_DOWNLOADER_IMAGE VBMC_IMAGE SUSHY_TOOLS_IMAGE DOCKER_REGISTRY_IMAGE
++ IMAGE=quay.io/metal3-io/ironic
++ sudo podman pull quay.io/metal3-io/ironic
+...
+....
+
+
+
The container image location of each one is defined by their respective variables:
+
+
+
+
+
Name of the variable
+
Default value
+
+
+
+
+
VBMC_IMAGE
+
quay.io/metal3-io/vbmc
+
+
+
SUSHY_TOOLS_IMAGE
+
quay.io/metal3-io/sushy-tools
+
+
+
IPA_DOWNLOADER_IMAGE
+
quay.io/metal3-io/ironic-ipa-downloader
+
+
+
IRONIC_IMAGE
+
quay.io/metal3-io/ironic
+
+
+
DOCKER_REGISTRY_IMAGE
+
docker.io/registry:latest
+
+
+
+
+
Information
In case it is expected to modify the public container images to test
+new features, it is worth mentioning that there is a container
+registry running as a privileged container in the host. Therefore it
+is recommended to upload your modified images there and just overwrite
+the previous variables to match the right location.
+
+
+
+
At this point, an Ansible role is run locally in order to complete the
+local configuration.
This playbook imports two roles. One is called packages_installation,
+which is in charge of installing a few more packages. The list of
+packages installed are listed as default Ansible variables in the
+vm-setup role inside the metal3-dev-env
+repository.
+The other role is based on the
+fubarhouse.golang
+Ansible Galaxy role. It is in charge of installing and configuring the
+exact golang version 1.12.12 defined in an Ansible variable in the
+install-package-playbook.yml
+playbook
+
+
Once the playbook is finished, a pod called ironic-pod is created.
+Inside that pod, a privileged ironic-ipa-downloader container is
+started and attached to the host network. This container is in charge of
+downloading the Ironic Python
+Agent (IPA)
+files to a shared volume defined by IRONIC_IMAGE_DIR. This folder is
+exposed by the ironic container through HTTP.
+
+
Information
The Ironic Python
+Agent is an
+agent for controlling and deploying Ironic controlled baremetal nodes.
+Typically run in a ramdisk, the agent exposes a REST API for
+provisioning servers.
Below is shown the status of the pods and containers at this point:
+
+
[root@eko1 metal3-dev-env]# podman pod list --ctr-names
+POD ID NAME STATUS CREATED CONTAINER INFO INFRA ID
+5a0d475351aa ironic-pod Running 6 days ago [5a0d475351aa-infra] [ipa-downloader] 18f3a8f61407
+
+
+
The process will wait until the ironic-python-agent (IPA) initramfs,
+kernel and headers files are downloaded successfully. See below the
+files downloaded along with the CentOS 7 cloud image:
Afterwards, the script makes sure that libvirt is running successfully
+on the host and that the non-privileged user has permission to interact
+with it. Libvirt daemon should be running so that minikube can be
+installed successfully. See the following script snippet starting the
+minikube VM:
+
+
+ sudo su -l-c'minikube start --insecure-registry 192.168.111.1:5000'
+* minikube v1.6.2 on Centos 7.7.1908
+* Selecting 'kvm2' driver from user configuration (alternates: [none])
+
+
+
In the same way, as with the host, container images are pre-cached but
+in this case inside minikube local image repository. Notice that in this
+case the Bare Metal
+operator (BMO) is
+also downloaded since it will run on minikube. The container location is
+defined by BAREMETAL_OPERATOR_IMAGE. In case you want to test new
+features or new fixes to the BMO, just change the value of the variable
+to match the location of the modified image:
+
+
+
+
+
Name of the variable
+
Default value
+
+
+
BAREMETAL_OPERATOR_IMAGE
+
quay.io/metal3-io/baremetal-operator
+
+
+
+
+
Note
Remember that minikube is the management cluster in our environment.
+So it must run all the operators and controllers needed for Metal³.
+
+
+
+
Below is shown the output of the script once all the container images
+have been pulled to minikube:
+
+
+ sudo su -l-c'minikube ssh sudo docker image ls' alosadag
+REPOSITORY TAG IMAGE ID CREATED SIZE
+quay.io/metal3-io/ironic latest e5d81adf05ee 26 hours ago 693MB
+quay.io/metal3-io/ironic-ipa-downloader latest d55b0dac2144 6 days ago 239MB
+quay.io/metal3-io/ironic-inspector latest 8bb5b844ada6 6 days ago 408MB
+quay.io/metal3-io/baremetal-operator latest 3c692a32ddd6 9 days ago 1.77GB
+k8s.gcr.io/kube-proxy v1.17.0 7d54289267dc 7 weeks ago 116MB
+k8s.gcr.io/kube-controller-manager v1.17.0 5eb3b7486872 7 weeks ago 161MB
+k8s.gcr.io/kube-scheduler v1.17.0 78c190f736b1 7 weeks ago 94.4MB
+k8s.gcr.io/kube-apiserver v1.17.0 0cae8d5cc64c 7 weeks ago 171MB
+kubernetesui/dashboard v2.0.0-beta8 eb51a3597525 7 weeks ago 90.8MB
+k8s.gcr.io/coredns 1.6.5 70f311871ae1 2 months ago 41.6MB
+k8s.gcr.io/etcd 3.4.3-0 303ce5db0e90 3 months ago 288MB
+kubernetesui/metrics-scraper v1.0.2 3b08661dc379 3 months ago 40.1MB
+k8s.gcr.io/kube-addon-manager v9.0.2 bd12a212f9dc 6 months ago 83.1MB
+k8s.gcr.io/pause 3.1 da86e6ba6ca1 2 years ago 742kB
+gcr.io/k8s-minikube/storage-provisioner v1.8.1 4689081edb10 2 years ago 80.8MB
+
+
+
Once the container images are stored, minikube can be stopped. At that
+moment, the virtual networks shown in the previous picture are attached
+to the minikube VM as can be verified by the following command:
At this point the host is ready to create the virtual infrastructure.
+
+
+
+
The video below exhibits all the configurations explained and executed
+during this first step.
+
+
+
+
+
+
+
+
Step 2: Configure the host
+
+
In this step, the script 02_configure_host.sh basically configures the
+libvirt/KVM virtual infrastructure and starts services in the host that
+will be consumed by the virtual bare metal machines:
+
+
+
Web server to expose the ironic-python-agent (IPA) initramfs,
+kernel, headers and operating system cloud images.
+
Virtual BMC to emulate a real baseboard management controller (BMC).
+
Container registry where the virtual servers will pull the images
+needed to run a K8s installation.
+
+
+
Information
A baseboard management controller (BMC) is a specialized service
+processor that monitors the physical state of a computer, network
+server or other hardware device using sensors and communicating with
+the system administrator through an independent connection. The BMC is
+part of the Intelligent Platform Management Interface (IPMI) and is
+usually contained in the motherboard or main circuit board of the
+device to be monitored.
+
+
+
+
First, an ssh-key in charge of communicating to libvirt is created if it
+does not exist previously. This key is called id_rsa_virt_power. It is
+added to the root authorized_keys and is used by vbmc and sushy tools
+to contact libvirt.
+
+
Information
sushy-tools is a set of simple simulation tools aiming at supporting
+the development and testing of the Redfish protocol implementations.
+
+
+
+
Next, another Ansible playbook called
+setup-playbook.yml
+is run against the host. It is focused on setting up the virtual
+infrastructure around metal3-dev-env. Below are shown the Ansible
+variables that are passed to the playbook, which actually are obtaining
+the values from the global variables defined in the
+common.sh or the configuration file.
There are variables that are only defined as Ansible variables, e.g.
+number of CPUs of the virtual bare metal server, size of disks, etc.
+In case you would like to change properties not defined globally in
+the metal3-dev-env take a look at the default variables specified in
+role:
+common
+and
+libvirt
+
+
+
+
The setup-playbook.yml is composed of 3 roles, which are detailed below:
+
+
+
+
Common.
+
+
+
This role sets up the virtual hardware and network configuration of
+the VMs. Actually it is a
+dependency
+of the libvirt and virtbmc Ansible roles. This means that the
+common role must always be executed before the roles that depend on
+it. Also, it is only executed once: if two roles state the same
+role as their dependency, it is only executed the first time.
+
+
+
Libvirt.
+
+
+
It actually is the role that configures the virtual bare metal
+servers. They are all identically defined with the same hardware and
+network configuration. Note that they are not started since they will
+be booted later by ironic during the provisioning process.
+
+
+
Note
It is possible to change the number of VMs to provision by replacing
+the value of NUMBER_NODES
+
+
Finally, once the VMs are defined and we have their MAC address, the
+ironic inventory file ironic_nodes_json is created. The action of
+creating a node is part of the enrollment process and the first step
+to prepare a node to reach the available status.
This role is also used to tear down the virtual infrastructure
+depending on the variable
+libvirt_action
+inside the Ansible role: setup or teardown.
+
+
+
VirtBMC
+
+
+
This role is only executed if the bare metal virtual machines are
+created in libvirt, because vbmc needs libvirt to emulate a real
+BMC.
+
+
Information
+VirtualBMC (vbmc) tool simulates a Baseboard Management Controller
+(BMC) by exposing an IPMI responder to the network and talking to libvirt
+at the host vBMC is running at. Basically, it manipulates virtual machines
+which pretend to be bare metal servers.
+
+
The virtbmc Ansible role creates the vbmc and sushy-tools
+configuration in the host for each virtual bare metal node. Note that
+each virtual bare metal host will have a different vbmc socket
+exposed in the host. The communication to each vbmc is needed by the
+BMO to start, stop, configure the boot order, etc. during the
+provisioning stage. Finally, these folders containing the configuration
+will be mounted by the vbmc and sushy-tools containers.
Next, both host provisioning and baremetal interfaces are configured.
+The provisioning interface, as the name suggests, will be used to
+provision the virtual bare metal hosts by means of the Bare Metal
+Operator. This interface is configured with a static IP (172.22.0.1):
On the other hand, the baremetal virtual interface behaves as an
+external network. This interface is able to reach the internet and it is
+the network where the different Kubernetes nodes will exchange
+information. This interface is configured as auto, so the IP is
+retrieved by DHCP.
Next, an Ansible role called
+firewall
+will be executed targeting the host to be sure that the proper ports
+are opened. In case your host is running Red Hat Enterprise Linux or
+CentOS 8, the firewalld module will be used. In any other case, the
+iptables module is the choice.
+
+
Below is the code snippet where firewalld or iptables is assigned:
This behavior can be changed by replacing the value of the
+USE_FIREWALLD variable
+
+
+
+
The ports managed by this role are all associated with the services that
+take part in the provisioning process: ironic, vbmc, httpd, pxe,
+container registry, etc.
+
+
Note
Services like ironic, pxe, keepalived, httpd and the container
+registry are running in the host as containers attached to the host
+network on the host’s provisioning interface. On the other hand, the
+vbmc service is also running as a privileged container and it is
+listening in the host’s baremetal interface.
+
+
+
+
Once the network is configured, a local container registry is started.
+It will be needed in the case of using locally built images. In that
+case, the container images can be modified locally and pushed to the
+local registry. At that point, the specific image location variable must
+be changed so that it points to the local registry. This process makes
+it easy to verify and test changes to the code locally.
+
+
At this point, the following containers are running inside two pods on
+the host: infra-pod and ironic-pod.
+
+
[root@eko1 metal3-dev-env]# podman pod list --ctr-names
+POD ID NAME STATUS CREATED CONTAINER INFO INFRA ID
+67cc53713145 infra-pod Running 6 days ago [vbmc] [sushy-tools] [httpd-infra] [67cc53713145-infra] f1da23fcd77f
+5a0d475351aa ironic-pod Running 6 days ago [5a0d475351aa-infra] [ipa-downloader] 18f3a8f61407
+
+
+
Below are detailed the containers inside the infra-pod pod which are
+running as privileged using the host network:
+
+
+
+
The httpd container.
+A folder called shared where the cloud OS image and IPA files are
+available is mounted and exposed to the virtual bare metal hosts.
This folder also contains the inspector.ipxe file which contains the
+information needed to be able to run the ironic-python-agent kernel
+and initramfs. Below, httpd-infra container is accessed and it has
+been verified that host’s /opt/metal3-dev-env/ironic/
+(IRONIC_DATA_DIR) is mounted inside the shared folder of the
+container:
This container mounts two host folders: one is
+/opt/metal3-dev-env/virtualbmc/vbmc where vbmc configuration for
+each node is stored, the other folder is /root/.ssh where root keys
+are located, specifically id_rsa_virt_power which is used to manage
+the communication with libvirt.
This container mounts the /opt/metal3-dev-env/virtualbmc/sushy-tools
+config folder and the /root/.ssh local folder as well. The
+functionality is similar to that of vbmc; however, this uses Redfish
+instead of IPMI to connect to the BMC.
At this point the virtual infrastructure must be ready to apply the
+Kubernetes specific configuration. Note that all the VMs specified by
+NUMBER_NODES and minikube must be shut down and the defined virtual
+network must be active:
+
+
+
+
[alosadag@smc-master metal3-dev-env]$ sudo virsh list --all
+ Id Name State
+----------------------------------------------------
+ - minikube shut off
+ - node_0 shut off
+ - node_1 shut off
+ - node_2 shut off
+
+
+[alosadag@smc-master metal3-dev-env]$ sudo virsh net-list --all
+ Name State Autostart Persistent
+----------------------------------------------------------
+ baremetal active yes yes
+ default active yes yes
+ minikube-net active yes yes
+ provisioning active yes yes
+
+
+
The video below shows all the configuration explained and
+executed during this second step.
+
+
+
+
+
+
+
+
Step 3: Launch the management cluster (minikube)
+
+
The third script called 03_launch_mgmt_cluster.sh basically configures
+minikube to become a Metal³ management cluster. On top of minikube the
+baremetal-operator, capi-controller-manager,
+capbm-controller-manager and cabpk-controller-manager are installed
+in the metal3 namespace.
+
+
In a more detailed way, the script clones the Bare Metal Operator
+(BMO) and Cluster
+API Provider for Managed Bare Metal Hardware operator
+(CAPBM)
+git repositories, creates the cloud.yaml file and starts the minikube
+virtual machine. Once minikube is up and running, the BMO is built and
+executed in minikube’s Kubernetes cluster.
+
+
In the case of the Bare Metal Operator, the branch by default to clone
+is master, however, this and other variables shown in the following
+table can be replaced in the config file:
Once the BMO variables are configured, it is time for the operator to
+be deployed using kustomize and kubectl as can be seen from the
+logs:
+
+
+
Information:Kustomize is a Kubernetes tool that lets you customize raw, template-free YAML files for multiple purposes, leaving the original YAML untouched and usable as is.
+
+
+
+ kustomize build bmo-dirPrHIrcl
++ kubectl apply -f-
+namespace/metal3 created
+customresourcedefinition.apiextensions.k8s.io/baremetalhosts.metal3.io created
+serviceaccount/metal3-baremetal-operator created
+clusterrole.rbac.authorization.k8s.io/metal3-baremetal-operator created
+clusterrolebinding.rbac.authorization.k8s.io/metal3-baremetal-operator created
+configmap/ironic-bmo-configmap-75tkt49k5c created
+secret/mariadb-password-d88m524c46 created
+deployment.apps/metal3-baremetal-operator created
+
+
+
Once the BMO objects are applied, it’s time to transform the virtual
+bare metal hosts information into a yaml file of kind BareMetalHost
+Custom Resource (CR). This is done by a golang script, which is passed the
+IPMI address, BMC username and password, which are stored as a
+Kubernetes secret, MAC address and name:
+
+
+ go run /home/alosadag/go/src/github.com/metal3-io/baremetal-operator/cmd/make-bm-worker/main.go -address ipmi://192.168.111.1:6230 -password password -user admin -boot-mac 00:be:bc:fd:17:f3 node-0
++ read -r name address user password mac
++ go run /home/alosadag/go/src/github.com/metal3-io/baremetal-operator/cmd/make-bm-worker/main.go -address ipmi://192.168.111.1:6231 -password password -user admin -boot-mac 00:be:bc:fd:17:f7 node-1
++ read -r name address user password mac
++ go run /home/alosadag/go/src/github.com/metal3-io/baremetal-operator/cmd/make-bm-worker/main.go -address ipmi://192.168.111.1:6232 -password password -user admin -boot-mac 00:be:bc:fd:17:fb node-2
++ read -r name address user password mac
+
+
+
Below is shown the bare metal host definition of node-1. Note that the
+IPMI address is the IP of the host’s provisioning interface. Behind the
+scenes, IPMI is handled by the vbmc container running in the host.
See that the MAC address configured in the BareMetalHost spec
+definition matches node-1 provisioning interface:
+
+
[root@eko1 metal3-dev-env]# virsh domiflist node_1
+Interface Type Source Model MAC
+-------------------------------------------------------
+vnet4 bridge provisioning virtio 00:00:e0:4b:24:8f
+vnet5 bridge baremetal virtio 00:00:e0:4b:24:91
+
+
+
Finally, the script applies in the metal3 namespace each of the
+BareMetalHost yaml files that match each virtual bare metal host:
+
+
+ kubectl apply -f bmhosts_crs.yaml -n metal3
+secret/node-0-bmc-secret created
+baremetalhost.metal3.io/node-0 created
+secret/node-1-bmc-secret created
+baremetalhost.metal3.io/node-1 created
+secret/node-2-bmc-secret created
+baremetalhost.metal3.io/node-2 created
+
+
+
Lastly, it is the turn of the CAPBM. Similar to BMO, kustomize is
+used to create the different Kubernetes components and kubectl applies
+the files into the management cluster.
Then, kustomize configures the files according to the values defined
+and kubectl applies them:
+
+
+ kustomize build capbm-eJPOjCPASD
++ kubectl apply -f-
+namespace/cabpk-system created
+namespace/capbm-system created
+namespace/capi-system created
+customresourcedefinition.apiextensions.k8s.io/baremetalclusters.infrastructure.cluster.x-k8s.io created
+customresourcedefinition.apiextensions.k8s.io/baremetalmachines.infrastructure.cluster.x-k8s.io created
+customresourcedefinition.apiextensions.k8s.io/baremetalmachinetemplates.infrastructure.cluster.x-k8s.io created
+customresourcedefinition.apiextensions.k8s.io/clusters.cluster.x-k8s.io created
+customresourcedefinition.apiextensions.k8s.io/kubeadmconfigs.bootstrap.cluster.x-k8s.io created
+customresourcedefinition.apiextensions.k8s.io/kubeadmconfigtemplates.bootstrap.cluster.x-k8s.io created
+customresourcedefinition.apiextensions.k8s.io/machinedeployments.cluster.x-k8s.io created
+customresourcedefinition.apiextensions.k8s.io/machines.cluster.x-k8s.io created
+customresourcedefinition.apiextensions.k8s.io/machinesets.cluster.x-k8s.io created
+role.rbac.authorization.k8s.io/cabpk-leader-election-role created
+role.rbac.authorization.k8s.io/capbm-leader-election-role created
+role.rbac.authorization.k8s.io/capi-leader-election-role created
+clusterrole.rbac.authorization.k8s.io/cabpk-manager-role created
+clusterrole.rbac.authorization.k8s.io/cabpk-proxy-role created
+clusterrole.rbac.authorization.k8s.io/capbm-manager-role created
+clusterrole.rbac.authorization.k8s.io/capbm-proxy-role created
+clusterrole.rbac.authorization.k8s.io/capi-manager-role created
+rolebinding.rbac.authorization.k8s.io/cabpk-leader-election-rolebinding created
+rolebinding.rbac.authorization.k8s.io/capbm-leader-election-rolebinding created
+rolebinding.rbac.authorization.k8s.io/capi-leader-election-rolebinding created
+clusterrolebinding.rbac.authorization.k8s.io/cabpk-manager-rolebinding created
+clusterrolebinding.rbac.authorization.k8s.io/cabpk-proxy-rolebinding created
+clusterrolebinding.rbac.authorization.k8s.io/capbm-manager-rolebinding created
+clusterrolebinding.rbac.authorization.k8s.io/capbm-proxy-rolebinding created
+clusterrolebinding.rbac.authorization.k8s.io/capi-manager-rolebinding created
+secret/capbm-webhook-server-secret created
+service/cabpk-controller-manager-metrics-service created
+service/capbm-controller-manager-service created
+service/capbm-controller-metrics-service created
+deployment.apps/cabpk-controller-manager created
+deployment.apps/capbm-controller-manager created
+deployment.apps/capi-controller-manager created
+
+
+
Information
At this point all controllers and operators must be running in the
+namespace metal3 of the management cluster (minikube). All virtual
+bare metal hosts configured must be shown as BareMetalHosts
+resources in the metal3 namespace as well. They should be in ready
+status and stopped (online is false)
+
+
+
+
The video below shows all the configuration explained and executed during this third step.
+
+
+
+
+
+
+
+
Step 4: Verification
+
+
The last script 04_verify.sh is in charge of verifying that the
+deployment has been successful by checking several things:
+
+
+
Custom resources (CR) and custom resource definition (CRD) were
+applied and exist in the cluster.
+
Verify that the virtual bare metal hosts match the information
+detailed in the BareMetalHost object.
+
All containers are in running status.
+
Verify virtual network configuration and status.
+
Verify operators and controllers are running.
+
+
+
However, this verification can be easily achieved manually. For
+instance, checking that controllers and operators are running in the
+management cluster (minikube) and all the virtual bare metal hosts are
+in ready status:
Verify that the BareMetalHosts provisioning status is ready and the
+BMC configuration is correct. Check that all virtual bare metal hosts
+are shut down (online is false):
+
+
[alosadag@eko1 ~]$ kubectl get baremetalhosts -n metal3
+NAME STATUS PROVISIONING STATUS CONSUMER BMC HARDWARE PROFILE ONLINE ERROR
+node-0 OK ready ipmi://192.168.111.1:6230 unknown false
+node-1 OK ready ipmi://192.168.111.1:6231 unknown false
+node-2 OK ready ipmi://192.168.111.1:6232 unknown false
+
+
+
Get the list of CRDs created in the cluster. Check that, at least, the
+following ones exist:
+
+
[alosadag@eko1 ~]$ kubectl get crds
+NAME CREATED AT
+baremetalclusters.infrastructure.cluster.x-k8s.io 2020-01-22T13:19:42Z
+baremetalhosts.metal3.io 2020-01-22T13:19:35Z
+baremetalmachines.infrastructure.cluster.x-k8s.io 2020-01-22T13:19:42Z
+baremetalmachinetemplates.infrastructure.cluster.x-k8s.io 2020-01-22T13:19:42Z
+clusters.cluster.x-k8s.io 2020-01-22T13:19:42Z
+kubeadmconfigs.bootstrap.cluster.x-k8s.io 2020-01-22T13:19:42Z
+kubeadmconfigtemplates.bootstrap.cluster.x-k8s.io 2020-01-22T13:19:42Z
+machinedeployments.cluster.x-k8s.io 2020-01-22T13:19:43Z
+machines.cluster.x-k8s.io 2020-01-22T13:19:43Z
+machinesets.cluster.x-k8s.io 2020-01-22T13:19:43Z
+
+
+
Information
KUBECONFIG file is stored in the user’s home directory
+(~/.kube/config) that executed the scripts.
+
+
+
+
Check the status of all the applications running in minikube or better
+said, in the management cluster.
The video below shows all the configuration explained and
+executed during the verification steps.
+
+
+
+
+
+
+
+
Summary
+
+
In this post a deep dive into the metal3-dev-env scripts was shown. The
+process of creating a Metal³ emulated environment from a set of virtual
+machines (VMs), managed as if they were bare metal hosts, has been
+described in detail.
+
+
After this post, the reader should have acquired a basic understanding
+of all the pieces involved in the Metal³ project. Also, and more
+importantly, how these scripts can be adapted to your specific needs.
+Remember that this can be achieved in multiple ways: replacing values in
+the global variables, replacing Ansible default variables or even
+modifying playbooks or the scripts themselves.
+
+
Notice that the Metal³ development environment also focuses on
+developing new features of the BMO or CAPBM and being able to test them
+locally.
Conference talk: Metal³: Kubernetes Native Bare Metal Cluster Management - Maël Kimmerlin
+
+
On the 20th of January at the Kubernetes and CNCF Finland Meetup, Maël Kimmerlin gave a brilliant presentation about the status of the Metal³ project.
+
+
In this presentation, Maël starts giving a short introduction of the Cluster API project which provides a solid foundation to develop the Metal³ Bare Metal Operator (BMO). The talk basically focuses on the v1alpha2 infrastructure provider features from the Cluster API.
+
+
Information
The video recording from the “Kubernetes and CNCF Finland Meetup” is composed of three talks. The video embedded starts with Maël’s talk.
+
+
warning “Warning”
+Playback of the video has been disabled by the author. Click on the play button and then on the “Watch this video on Youtube” link once it appears.
+
+
+
+
+
+
+
+
+
+
During the first part of the presentation, a detailed explanation of the different Kubernetes Custom Resource Definitions (CRDs) inside Metal³ is shown, as well as how they are linked with the Cluster API project. As an example, the image below shows the interaction between objects and controllers from both projects:
+
+
+
+
Once the introductory part is finished, Maël focuses on the main components of the Metal³ BMO and the provisioning process. This process starts with introspection, where the bare metal server is registered by the operator. Then, the Ironic Python Agent (IPA) image is executed to collect all hardware information from the server.
+
+
+
+
The second part of the process is the provisioning. In this step, Maël explains how the Bare Metal Operator (BMO) is in charge along with Ironic to present the Operating System image to the physical server and complete its installation.
+
+
+
+
Next, Maël deeply explains each Custom Resource (CR) used during the provisioning of target Kubernetes clusters in bare metal servers. He refers to objects such as Cluster, BareMetalCluster, Machine, BareMetalMachine, BareMetalHost and so on. Each one is clarified with a YAML file definition of a real case and a workflow diagram that shows the reconciliation procedure.
+
+
The last part of the talk is dedicated to executing a demo where Maël creates a target Kubernetes cluster from a running minikube VM (also called bootstrap cluster) where Metal³ is deployed. As it is pointed out in the video, the demo is running in emulated hardware. Actually, something similar to the metal3-dev-env project can be used to reproduce the demo. More information on the Metal³ development environment (metal3-dev-env) can be found in the Metal³ try-it section. In case you want to go deeper, take a look at the blog post A detailed walkthrough of the Metal³ development environment.
+
+
In the end, the result is a new Kubernetes cluster up and running. The cluster is deployed on two emulated physical servers: one runs as the control-plane node and the other as a worker node.
+
+
Information
The slides of the talk can be downloaded from here
+
+
+
+
Speakers
+
+
Maël Kimmerlin Maël Kimmerlin is a Senior Software Engineer at Ericsson. In his own words:
+
+
I am an open-source enthusiast, focusing in Ericsson on Life Cycle Management of Kubernetes clusters on Bare Metal. I am very interested in the Cluster API project from the Kubernetes Lifecycle SIG, and active in its Bare Metal provider, that is Metal³, developing and encouraging the adoption of this project.
There is no backwards compatibility between v1alpha3 and v1alpha2 releases of
+the Cluster API provider for Metal3.
+
+
+
+
For the v1alpha3 release of Cluster API, the Metal3 provider was renamed from
+cluster-api-provider-baremetal to cluster-api-provider-metal3. The Custom
+Resource Definitions were also modified. This post dives into the changes.
+
+
Repository renaming
+
+
From v1alpha3 onwards, the Cluster API provider will be developed in
+cluster-api-provider-metal3.
+The v1alpha1 and v1alpha2 content will remain in
+cluster-api-provider-baremetal.
+This repository will be archived but kept for the integration in metal3-dev-env.
+
+
Custom Resource Definition modifications
+
+
The kind of Custom Resource Definition (CRD) has been modified for the
+following objects:
+
+
+
BareMetalCluster -> Metal3Cluster
+
baremetalcluster -> metal3cluster
+
BareMetalMachine -> Metal3Machine
+
baremetalmachine -> metal3machine
+
BareMetalMachineTemplate -> Metal3MachineTemplate
+
baremetalmachinetemplate -> metal3machinetemplate
+
+
+
The custom resources deployed need to be modified accordingly.
+
+
Deployment modifications
+
+
The prefix of all deployed components for the Metal3 provider was modified
+from capbm- to capm3-. The namespace in which the components are deployed by
+default was modified from capbm-system to capm3-system.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/blog/2020/06/18/Metal3-dev-env-BareMetal-Cluster-Deployment.html b/blog/2020/06/18/Metal3-dev-env-BareMetal-Cluster-Deployment.html
new file mode 100644
index 000000000..06c9f9b5a
--- /dev/null
+++ b/blog/2020/06/18/Metal3-dev-env-BareMetal-Cluster-Deployment.html
@@ -0,0 +1,1166 @@
+
+
+
+
+
+
+
+
+
+
+
+ Metal³ development environment walkthrough part 2: Deploying a new bare metal cluster | Metal³ - Metal Kubed
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
This blog post describes how to deploy a bare metal cluster, a virtual
+one for simplicity, using
+Metal³/metal3-dev-env. We
+will briefly discuss the steps involved in setting up the cluster as
+well as some of the customization available. If you want to know more
+about the architecture of Metal³, this blogpost can be helpful.
+
+
This post builds upon the detailed metal3-dev-env walkthrough
+blogpost
+which describes in detail the steps involved in the environment set-up
+and management cluster configuration. Here we will use that environment
+to deploy a new Kubernetes cluster using Metal³.
+
+
Before we get started, there are a couple of requirements we are
+expecting to be fulfilled.
The appropriate environment variables are set up via shell or in the
+config_${user}.sh file, for example
+
+
CAPM3_VERSION
+
NUM_NODES
+
CLUSTER_NAME
+
+
+
+
+
Overview of Config and Resource types
+
+
In this section, we give a brief overview of the important config files
+and resources used as part of the bare metal cluster deployment. The
+following sub-sections show the config files and resources that are
+created and give a brief description of some of them. This will help you
+understand the technical details of the cluster deployment. You can also
+choose to skip this section, visit the next section about provisioning
+first and then revisit this.
+
+
Config Files and Resources Types
+
+
+
+
+
info “Information” Among these the config files are rendered under the
+path
+https://github.com/metal3-io/metal3-dev-env/tree/master/vm-setup/roles/v1aX_integration_test/files
+as part of the provisioning process.
+
+
+
A description of some of the files part of provisioning a cluster, in a
+centos-based environment:
+
+
+
+
+
+
+
Name
+
Description
+
Path
+
+
+
+
+
provisioning scripts
+
Scripts to trigger provisioning of cluster, control plane or worker
+
${metal3-dev-env}/scripts/provision/
+
+
+
deprovisioning scripts
+
Scripts to trigger deprovisioning of cluster, control plane or worker
Here are some of the resources that are created as part of provisioning :
+
+
+
+
+
Name
+
Description
+
+
+
+
+
Cluster
+
a Cluster API resource for managing a cluster
+
+
+
Metal3Cluster
+
Corresponding Metal3 resource generated as part of bare metal cluster deployment, and managed by Cluster
+
+
+
KubeadmControlPlane
+
Cluster API resource for managing the control plane, it also manages the Machine object, and has the KubeadmConfig
+
+
+
MachineDeployment
+
Cluster API resource for managing workers via MachineSet object, it can be used to add/remove workers by scaling Up/Down
+
+
+
MachineSet
+
Cluster API resource for managing Machine objects for worker nodes
+
+
+
Machine
+
Cluster API resource for managing nodes - control plane or workers. In case of Controlplane, it’s directly managed by KubeadmControlPlane, whereas for Workers it’s managed by a MachineSet
+
+
+
Metal3Machine
+
Corresponding Metal3 resource for managing bare metal nodes, it’s managed by a Machine resource
+
+
+
Metal3MachineTemplate
+
Metal3 resource which acts as a template when creating a control plane or a worker node
+
+
+
KubeadmConfigTemplate
+
A template of KubeadmConfig, for Workers, used to generate KubeadmConfig when a new worker node is provisioned
+
+
+
+
+
+
+
Note
The corresponding KubeadmConfig is copied to the control
+plane/worker at the time of provisioning.
+
+
+
+
Bare Metal Cluster Deployment
+
+
The deployment scripts primarily use ansible and the existing Kubernetes
+management cluster (based on minikube ) for deploying the bare-metal
+cluster. Make sure that some of the environment variables used for
+Metal³ deployment are set, if you didn’t use config_${user}.sh for
+setting the environment variables.
+
+
+
+
+
Parameter
+
Description
+
Default
+
+
+
+
+
CAPM3_VERSION
+
Version of Metal3 API
+
v1alpha3
+
+
+
POD_CIDR
+
Pod Network CIDR
+
192.168.0.0/18
+
+
+
CLUSTER_NAME
+
Name of bare metal cluster
+
test1
+
+
+
+
+
===
+
+
Steps Involved
+
+
All the scripts for cluster provisioning or de-provisioning are located
+at -
+${metal3-dev-env}/scripts/.
+The scripts call a common playbook which handles all the tasks that are
+available.
+
+
The steps involved in the process are:
+
+
+
The script calls an ansible playbook with necessary parameters ( from
+env variables and defaults )
There are
+templates
+in the role, which are used to render configurations in the Manifest
+directory. These configurations use kubeadm and are supplied to the
+Kubernetes module of ansible to create the cluster.
+
During provisioning, first the clusterctl env file is generated,
+then the cluster, control plane and worker definition templates for
+clusterctl are generated at
+${HOME}/.cluster-api/overrides/infrastructure-metal3/${CAPM3RELEASE}.
+
Using the templates generated in the previous step, the definitions
+for resources related to cluster, control plane and worker are
+rendered using clusterctl.
+
Centos or Ubuntu image is
+downloaded
+in the next step.
+
Finally using the above definitions, which are passed to the K8s
+module in ansible, the corresponding resource( cluster/control
+plane/worker ) is provisioned.
+
These same definitions are reused at the time of de-provisioning the
+corresponding resource, again using the K8s module in ansible
+
+
note “Note” The manifest directory is created when provisioning is
+triggered for the first time and is subsequently used to store the
+config files that are rendered for deploying the bare metal cluster.
+
+
+
+
+
+
+
Provision Cluster
+
+
This script, located at the path -
+${metal3-dev-env}/scripts/provision/cluster.sh, provisions the cluster
+by creating a Metal3Cluster and a Cluster resource.
+
+
To see if you have a successful Cluster resource creation( the cluster
+still doesn’t have a control plane or workers ), just do:
+
+
kubectl get Metal3Cluster ${CLUSTER_NAME} -n metal3
+
+
+
+
This will return the cluster deployed, and you can check the cluster
+details by describing the returned resource.
This script, located at the path -
+${metal3-dev-env}/scripts/provision/controlplane.sh, provisions the
+control plane member of the cluster using the rendered definition of the
+control plane explained in the Steps Involved section. The
+KubeadmControlPlane creates a Machine which picks up a BareMetalHost
+satisfying its requirements as the control plane node, and it is then
+provisioned by the Bare Metal Operator. A Metal3MachineTemplate
+resource is also created as part of the provisioning process.
+
+
Note
It takes some time for the provisioning of the control plane, you can
+watch the process using some steps shared a bit later
kubectl get Metal3MachineTemplate ${CLUSTER_NAME}-controlplane -n metal3
+
+
+
To track the progress of provisioning, you can try the following:
+
+
kubectl get BareMetalHosts -n metal3 -w
+
+
+
+
The BareMetalHosts resource is created when Metal³/metal3-dev-env
+was deployed. It is a kubernetes resource that represents a bare metal
+Machine, with all its details and configuration, and is managed by the
+Bare Metal Operator. You can also use the short representation
+instead, i.e. bmh ( short for BareMetalHosts) in the command
+above.
+You should see all the nodes that were created at the time of metal3
+deployment, along with their current status as the provisioning
+progresses
+
+
Note
All the bare metal hosts listed above were created when Metal³ was
+deployed in the detailed metal3-dev-env walkthrough blogpost.
+
+
+
+
kubectl get Machine -n metal3 -w
+
+
+
+
This shows the status of the Machine associated with the control plane
+and we can watch the status of provisioning under PHASE
+
+
+
Once the provisioning is finished, let’s get the host-ip:
+
+
sudo virsh net-dhcp-leases baremetal
+
+
+
Information
baremetal is one of the 2 networks that were created at the time of
+Metal3 deployment, the other being “provisioning” which is used - as
+you have guessed - for provisioning the bare metal cluster. More
+details about networking setup in the metal3-dev-env environment are
+described in the - detailed metal3-dev-env walkthrough
+blogpost.
+
+
+
+
You can log in to the control plane node if you want, and can check the
+deployment status using two methods.
The script is located at
+${metal3-dev-env-path}/scripts/provision/worker.sh and it provisions a
+node to be added as a worker to the bare metal cluster. It selects one
+of the remaining nodes and provisions it and adds it to the bare metal
+cluster ( which only has a control plane node at this point ). The
+resources created for workers are - MachineDeployment which can be
+scaled up to add more workers to the cluster and MachineSet which then
+creates a Machine managing the node.
+
+
Information
Similar to control plane provisioning, worker provisioning also takes
+some time, and you can watch the process using steps shared a bit
+later. This will also apply when you scale Up/Down workers at a later
+point in time.
To check the status we can follow steps similar to Controlplane case:
+
+
kubectl get bmh -n metal3 -w
+
+
+
+
We can see the live status of the node being provisioned. As mentioned
+before bmh is the short representation of BareMetalHosts.
+
+
+
kubectl get Machine -n metal3 -w
+
+
+
+
This shows the status of Machines associated with workers, apart from
+the one for Controlplane, and we can watch the status of provisioning
+under PHASE
+
+
+
sudo virsh net-dhcp-leases baremetal
+
+
+
+
To get the node’s IP
+
+
+
ssh metal3@{control-plane-node-ip}
+kubectl get nodes
+
We can add workers to, or remove them from, the cluster by scaling the
+MachineDeployment up or down. In this example we are adding 2 more
+worker nodes, making the total nodes = 3
+
+
+
Deprovisioning
+
+
All of the previous components have corresponding de-provisioning
+scripts which use config files, in the previously mentioned manifest
+directory, and use them to clean up the worker, control plane and
+cluster.
+
+
This step will use the already generated cluster/control plane/worker
+definition file, and supply it to Kubernetes ansible module to
+remove/de-provision the resource. You can find it, under the Manifest
+directory, in the Snapshot shared at the beginning of this blogpost
+where we show the file structure.
+
+
For example, if you wish to de-provision the cluster, you would do:
+
+
sh ${metal3-dev-env-path}/scripts/deprovision/worker.sh
+sh ${metal3-dev-env-path}/scripts/deprovision/controlplane.sh
+sh ${metal3-dev-env-path}/scripts/deprovision/cluster.sh
+
+
+
Note
The reason for running the deprovision/worker.sh and
+deprovision/controlplane.sh scripts is that not all objects are
+cleared when we just run the deprovision/cluster.sh script.
+Following this, if you want to de-provision the control plane it is
+recommended to de-provision the cluster itself since we can’t
+provision a new control plane with the same cluster. For worker
+de-provisioning, we only need to run the worker script.
+
+
+
+
The following video demonstrates all the steps to provision and
+de-provision a Kubernetes cluster explained above.
+
+
+
+
+
+
+
+
Summary
+
+
In this blogpost we saw how to deploy a bare metal cluster once we have
+a Metal³(metal3-dev-env repo) deployed and by that point we will already
+have the nodes ready to be used for a bare metal cluster deployment.
+
+
In the first section, we show the various configuration files,
+templates, resource types and their meanings. Then we see the common
+steps involved in the provisioning process. After that, we see a general
+overview of how all resources are related and at what point they are
+created - provision cluster/control plane/worker.
+
+
In each of the provisioning sections, we see the steps to monitor the
+provisioning and how to confirm if it’s successful or not, with brief
+explanations wherever required. Finally, we see the de-provisioning
+section which uses the resource definitions generated at the time of
+provisioning to de-provision cluster, control plane or worker.
+
+
Here are a few resources which you might find useful if you want to
+explore further, some of them have already been shared earlier.
Metal3 supports multiple types of images for deployment, the most
+popular being QCOW2. We have recently added support for a feature of Ironic
+that improves deployments on constrained environments, raw image streaming.
+We’ll first dive into how Ironic deploys the images on the target hosts, and
+how raw image streaming improves this process. Afterwards, we will point out
+the changes to take this into use in Metal3.
+
+
Image deployments with Ironic
+
+
In Metal3, the image deployment is performed by the Ironic Python Agent (IPA)
+image running on the target host. In order to deploy an image, Ironic will
+first boot the target node with an IPA image over iPXE. IPA will run in memory.
+
+
Once IPA runs on the target node, Ironic will instruct it to download the
+target image. In Metal3, we use HTTP(S) for the download of the image. IPA will
+download the image and, depending on the format of the image, prepare it to
+write on the disk. This means that the image is downloaded in memory and
+decompressed, two steps that can be both time and memory consuming.
+
+
In order to improve this process, Ironic implemented a feature called raw image
+streaming.
+
+
What is raw image streaming?
+
+
The target image format when writing to disk is raw. That’s why the images in
+formats like QCOW2 must be processed before being written to disk. However, if
+the image that is downloaded is already in raw format, then no processing is
+needed.
+
+
Ironic leverages this, and instead of first downloading the image and then
+processing it before writing it to disk, it will directly write the
+downloaded image to the disk. This feature is known as image streaming.
+Image streaming can only be performed with images in raw format.
+
+
Since the downloaded image when streamed is directly written to disk, the
+memory size requirements change. For any other format than raw, the target
+host needs to have sufficient memory to both run IPA (4GB) and
+download the image in memory. However, with raw images, the only constraint
+on memory is to run IPA (so 4GB). For example, in order to deploy an Ubuntu
+image (around 700MB, QCOW2), the requirement is 8GB when in QCOW2 format, while
+it is only 4GB (as for any other image) when streamed as raw. This allows
+the deployment of images that are bigger than the available memory on
+constrained nodes.
+
+
However, this shifts the load on the network, since the raw images are usually
+much bigger than other formats. Using this feature in network-constrained
+environments is not recommended.
+
+
Raw image streaming in Metal3
+
+
In order to use raw image streaming in Metal3, a couple of steps are needed.
+The first one is to convert the image to raw and make it available in an
+HTTP server. This can be achieved by running :
+
+
qemu-img convert -O raw "${IMAGE_NAME}" "${IMAGE_RAW_NAME}"
+
+
+
Once converted the image format needs to be provided to Ironic through the
+BareMetalHost (BMH) image spec field. If not provided, Ironic will assume that
+the format is unspecified and download it in memory first.
+
+
The following is an example of the BMH image spec field in Metal3 Dev Env.
If deploying with Cluster API provider Metal3 (CAPM3), CAPM3 takes care of
+setting the image field of BMH properly, based on the image field values in
+the Metal3Machine (M3M), which might be based on a Metal3MachineTemplate (M3MT).
+So in order to use raw image streaming, the format of the image must be
+provided in the image spec field of the Metal3Machine or Metal3MachineTemplate.
+
+
The following is an example of the M3M image spec field in metal3-dev-env :
This will enable raw image streaming. By default, metal3-dev-env uses the raw image
+streaming, in order to minimize the resource requirements of the environment.
+
+
In a nutshell
+
+
With the addition of raw image streaming, Metal3 now supports a wider range of
+hardware, specifically memory-constrained nodes, and speeds up deployments.
+Metal3 still supports all the other formats it supported until now. This new
+feature changes the way raw images are deployed for better efficiency.
As a part of developing the Cluster API Provider Metal3 (CAPM3) v1alpha4
+release, the Metal3 crew introduced a new project: its own IP Address Manager.
+This blog post will go through the motivations behind such a project, the
+features that it brings, its use in Metal3 and future work.
+
+
What is the IP Address Manager?
+
+
The IP Address Manager (IPAM) is a controller that provides IP addresses and
+manages the allocations of IP subnets. It is not a DHCP server in that it only
+reconciles Kubernetes objects and does not answer any DHCP queries. It
+allocates IP addresses on request but does not handle any use of those
+addresses.
+
+
This sounds like the description of any IPAM system, no? Well, the twist
+is that this manager is based on Kubernetes to specifically handle some
+constraints from Metal3. We will go through the different issues that this
+project tackles.
+
+
When deploying nodes in a bare metal environment, there are a lot of possible
+variations. This project specifically aims to solve cases where static
+IP address configurations are needed. It is designed to specifically address
+this in the Cluster API (CAPI) context.
+
+
CAPI addresses the deployment of Kubernetes clusters and nodes, using
+the Kubernetes API. As such, it uses objects such as Machine Deployments
+(similar to deployments for pods) that take care of creating the requested
+number of machines, based on templates. The replicas can be increased by the
+user, triggering the creation of new machines based on the provided templates.
+This mechanism does not allow for flexibility to be able to provide static
+addresses for each machine. The manager adds this flexibility by providing
+the address right before provisioning the node.
+
+
In addition, all the resources from the source cluster must support the CAPI
+pivoting, i.e. being copied and recreated in the target cluster. This means
+that all objects must contain all needed information in their spec field to
+recreate the status in the target cluster without losing information. All
+objects must, through a tree of owner references, be attached to the cluster
+object, for the pivoting to proceed properly.
+
+
In a nutshell, the manager provides an IP Address allocation service, based
+on Kubernetes API and fulfilling the needs of Metal3, specifically the
+requirements of CAPI.
+
+
How does it work?
+
+
The manager follows the same logic as the volume allocation in Kubernetes,
+with a claim and an object created for that claim. There are three types of
+objects defined, the IPPool, the IPClaim and the IPAddress objects.
+
+
The IPPool objects contain the definition of the IP subnets from which the
+Addresses are allocated. It supports both IPv4 and IPv6. The subnets can either
+be defined as such or given as start and end IP addresses with a prefix.
+It also supports pre-allocating IP addresses.
Whenever something requires an IP address from the IPPool, it will create an
+IPClaim. The IPClaim contains a pointer to the IPPool and an owner reference
+to the object that created it.
The controller will then reconcile this object and allocate an IP address. It
+will create an IPAddress object representing the allocated address. It will
+then update the IPPool status to list the IP Address and the IPClaim status
+to point to the IPAddress.
The IP Address Manager is used in Metal3 together with the metadata and network
+data templates feature. Each Metal3Machine (M3M) and Metal3MachineTemplate
+(M3MT) is associated with a Metal3DataTemplate that contains metadata and /
+or a network data template that will be rendered for each Metal3Machine. The
+rendered data will then be provided to Ironic. Those templates reference
+IPPool objects. For each Metal3Machine, an IPClaim is created for each
+IPPool, and the templates are rendered with the allocated IPAddress.
+
+
This is how we achieve dynamic IP Address allocations in setups that
+require static configuration, allowing us to use Machine Deployment and Kubeadm
+Control Plane objects from CAPI in hardware labs where DHCP is not supported.
+
+
Since each IPAddress has an owner reference set to its IPClaim object, and
+IPClaim objects have an owner reference set to the Metal3Data object created
+from the Metal3DataTemplate, the owner reference chain links a Metal3Machine to
+all the IPClaim and IPAddress objects that were created for it, allowing for CAPI
+pivoting.
+
+
What now?
+
+
The project is fulfilling its basic requirements, but we are looking into
+extending it and covering more use cases. For example, we are looking at
+adding integration with Infoblox and other external IPAM services. Do not
+hesitate to open an issue if you have some ideas for new features!
The Metal3 project has introduced pivoting in its CI workflow. The motivation for
+pivoting is to move all the objects from the ephemeral/management
+cluster to a target cluster. This blog post will briefly introduce the concept
+of pivoting and the impact it has on the overall CI workflow. For the rest of
+this blog, we refer to the ephemeral/management cluster as an ephemeral cluster.
+
+
What is Pivoting?
+
+
In the context of Metal3 Provider, Pivoting is the process of moving
+Cluster-API and Metal3 objects from the ephemeral k8s cluster to a target
+cluster. In Metal3, this process is performed using the
+clusterctl tool
+provided by Cluster-API. clusterctl recognizes pivoting as a move. During the
+pivot process, clusterctl pauses any reconciliation of Cluster-API objects and
+this gets propagated to Cluster-api-provider-metal3 (CAPM3) objects as well.
+Once all the objects are paused, the objects are created on the other side on
+the target cluster and deleted from the ephemeral cluster.
+
+
Prerequisites
+
+
Prior to the actual pivot process, the target cluster should already have the
+provider components, ironic containers and CNI installed and running. To perform
+pivot outside metal3-dev-env, specifically, the following points need to be
+addressed:
+
+
+
clusterctl is used to initialize both the ephemeral and target cluster.
+
BMH objects have correct status annotation.
+
Maintain connectivity towards the provisioning network.
+
Baremetal Operator(BMO) is deployed as part of CAPM3.
+
Objects should have a proper owner reference chain.
+
+
+
For a detailed explanation of the above-mentioned prerequisites please read the
+pivoting documentation.
+
+
Pivoting workflow in CI
+
+
The Metal3 CI currently includes pivoting as part of the deployment
+process both for Ubuntu and CentOS-based jobs. This essentially means all
+the PRs that go in, are tested through the pivoting workflow. Here is the
+CI deployment workflow:
+
+
+
make the metal3-dev-env.
+It gives us the ephemeral cluster with all the necessary controllers running
+within it. The corresponding metal3-dev-env command is make
+
provision target cluster. For normal integration tests, this step deploys
+a control-plane node and a worker in the target cluster. For feature-test
+and feature-test-upgrade the provision step deploys three control-planes and
+a worker. The corresponding metal3-dev-env commands are (normal integration
+test workflow):
Initialize the provider components on the target cluster. This installs all
+the controllers and associated components related to cluster-api,
+cluster-api-provider-metal3, baremetal-operator and ironic. Since it is
+necessary to have only one set of ironic deployment/containers in the picture,
+this step also deletes the ironic deployment/containers from
+ephemeral cluster.
+
Move all the objects from ephemeral to the target cluster.
+
Check the status of the objects to verify whether the objects are being
+reconciled correctly by the controllers in the target cluster. This step
+verifies and finalizes the pivoting process. The corresponding metal3-dev-env
+command that performs this and the previous two steps is:
+
+
+
./scripts/feature_tests/pivoting/pivot.sh
+
+
+
+
Move the objects back to the ephemeral cluster. This step also
+removes the ironic deployment from the target cluster and reinstates the
+ironic deployment/containers in the ephemeral cluster. Since we do
+not delete the provider components in the ephemeral cluster,
+installing them again is not necessary. The corresponding metal3-dev-env command
+that performs this step is :
+
+
+
./scripts/feature_tests/pivoting/repivot.sh
+
+
+
+
De-provision the BMHs and delete the target cluster. The corresponding
+metal3-dev-env commands to de-provision worker, controlplane and the cluster
+are as follows:
Note that, if we de-provision cluster, that would de-provision worker and
+controlplane automatically.
+
+
Pivoting in Metal3
+
+
The pivoting process described above is realized in ansible scripts
+move.yml
+and
+move_back.yml.
+Under the hood, pivoting uses the move command from
+clusterctl
+provided by Cluster-API.
+
+
As stated earlier, for every PR that goes into a Metal3 repository where the
+integration tests are run, the code change introduced in the PR is now also
+verified with pivoting in the integration tests. Moreover, the upgrade workflow in
+Metal3 performs all the upgrade operations in Metal3 after pivoting to the
+target cluster.
Running on bare metal has both benefits and drawbacks. You can get the
+best performance possible out of the hardware, but it can also be quite
+expensive and maybe not necessary for all workloads. Perhaps a hybrid
+cluster could give you the best of both? Raw power for the workload that
+needs it, and cheap virtualized commodity for the rest. This blog post
+will show how to set up a cluster like this using the Cluster API backed
+by the Metal3 and BYOH providers.
+
+
The problem
+
+
Imagine that you have some bare metal servers that you want to use for
+some specific workload. Maybe the workload benefits from the specific
+hardware or there are some requirements that make it necessary to run it
+there. The rest of the organization already uses Kubernetes and the
+cluster API everywhere so of course you want the same for this as well.
+Perfect, grab Metal³ and start working!
+
+
But hold on, this would mean that you use some of the servers for
+running the Kubernetes control plane and possibly all the cluster API
+controllers. If there are enough servers this is probably not an issue,
+but do you really want to “waste” these servers on such generic
+workloads that could be running anywhere? This can become especially
+painful if you need multiple control plane nodes. Each server is
+probably powerful enough to run all the control planes and controllers,
+but it would be a single point of failure…
+
+
What if there was a way to use a different cluster API infrastructure
+provider for some nodes? For example, use the Openstack infrastructure
+provider for the control plane and Metal³ for the workers. Let’s do an
+experiment!
+
+
Setting up the experiment environment
+
+
This blog post will use the Bring your own
+host
+(BYOH) provider together with Metal³ as a proof of concept to show what
+is currently possible.
+
+
The BYOH provider was chosen as the second provider for two reasons:
+
+
+
Due to its design (you provision the host yourself), it is very easy
+to adapt it to the test (e.g. use a VM in the same network that the
+metal3-dev-env uses).
+
It is one of the providers that is known to work when combining
+multiple providers for a single cluster.
+
+
+
We will be using the
+metal3-dev-env on Ubuntu
+as a starting point for this experiment. Note that it makes substantial
+changes to the machine where it is running, so you may want to use a
+dedicated lab machine instead of your laptop for this. If you have not
+done so already, clone it and run make. This should give you a
+management cluster with the Metal³ provider installed and two
+BareMetalHosts ready for provisioning.
+
+
The next step is to add the BYOH provider and a ByoHost.
+
+
clusterctl init --infrastructure byoh
+
+
+
For the ByoHost we will use Vagrant.
+You can install it with sudo apt install vagrant.
+Then copy the Vagrantfile below to a new folder and run vagrant up.
+
+
# -*- mode: ruby -*-
+hosts = {
+ "control-plane1" => { "memory" => 2048, "ip" => "192.168.10.10"},
+ # "control-plane2" => { "memory" => 2048, "ip" => "192.168.10.11"},
+ # "control-plane3" => { "memory" => 2048, "ip" => "192.168.10.12"},
+}
+
+
+Vagrant.configure("2") do |config|
+ # Choose which box you want below
+ config.vm.box = "generic/ubuntu2004"
+ config.vm.synced_folder ".", "/vagrant", disabled: true
+ config.vm.provider :libvirt do |libvirt|
+ # QEMU system connection is required for private network configuration
+ libvirt.qemu_use_session = false
+ end
+
+
+ # Loop over all machine names
+ hosts.each_key do |host|
+ config.vm.define host, primary: host == hosts.keys.first do |node|
+ node.vm.hostname = host
+ node.vm.network :private_network, ip: hosts[host]["ip"],
+ libvirt__forward_mode: "route"
+ node.vm.provider :libvirt do |lv|
+ lv.memory = hosts[host]["memory"]
+ lv.cpus = 2
+ end
+ end
+ end
+end
+
+
+
Vagrant should now have created a new VM to use as a ByoHost. Now we
+just need to run the BYOH agent in the VM to make it register as a
+ByoHost in the management cluster. The BYOH agent needs a kubeconfig
+file to do this, so we start by copying it to the VM:
+
+
+
+
cp ~/.kube/config ~/.kube/management-cluster.conf
+# Ensure that the correct IP is used (not localhost)
+export KIND_IP=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' kind-control-plane)
+sed -i 's/ server\:.*/ server\: https\:\/\/'"$KIND_IP"'\:6443/g' ~/.kube/management-cluster.conf
+scp -i .vagrant/machines/control-plane1/libvirt/private_key \
+ /home/ubuntu/.kube/management-cluster.conf vagrant@192.168.10.10:management-cluster.conf
+
+
+
+
+
+
Next, install the prerequisites and host agent in the VM and run it.
You should now have a management cluster with both the Metal³ and BYOH
+providers installed, as well as two BareMetalHosts and one ByoHost.
+
+
$ kubectl -n metal3 get baremetalhosts,byohosts
+NAME STATE CONSUMER ONLINE ERROR AGE
+baremetalhost.metal3.io/node-0 available true 18m
+baremetalhost.metal3.io/node-1 available true 18m
+
+
+NAME AGE
+byohost.infrastructure.cluster.x-k8s.io/control-plane1 73s
+
+
+
Creating a multi-provider cluster
+
+
The trick is to create both a Metal3Cluster and a ByoCluster that are
+owned by one common Cluster. We will use the ByoCluster for the control
+plane in this case. First the Cluster:
Add the rest of the BYOH manifests to get a control plane.
+The code is collapsed here for easier reading.
+Please click on the line below to expand it.
So far this is a “normal” Cluster backed by the BYOH provider. But now
+it is time to do something different. Instead of adding more ByoHosts as
+workers, we will add a Metal3Cluster and MachineDeployment backed by
+BareMetalHosts! Note that the controlPlaneEndpoint of the
+Metal3Cluster must point to the same endpoint that the ByoCluster is
+using.
These manifests are quite large but they are just the same as would be
+used by the metal3-dev-env with some name changes here and there. The
+key thing to note is that all references to a Cluster are to the one we
+defined above. Here is the MachineDeployment:
Finally, we add the Metal3MachineTemplate, Metal3DataTemplate and
+KubeadmConfigTemplate. Here you may want to add your public ssh key in
+the KubeadmConfigTemplate (the last few lines).
$ kubectl get nodes
+NAME STATUS ROLES AGE VERSION
+control-plane1 Ready control-plane,master 88m v1.23.5
+test1-8767dbccd-24cl5 Ready <none> 82m v1.23.5
+
+
+
Going back to the management cluster, we can inspect the state of the
+cluster API resources.
As we have seen in this post, it is possible to combine at least some
+infrastructure providers when creating a single cluster. This can be
+useful for example if a provider has a high cost or limited resources.
+Furthermore, the use case is not addressed by MachineDeployments since
+they would all be from the same provider (even though they can have
+different properties).
+
+
There is some room for development and improvement though. The most
+obvious thing is perhaps that Clusters only have one
+infrastructureRef. This means that the cluster API controllers are not
+aware of the “secondary” infrastructure provider(s).
+
+
Another thing that may be less obvious is the reliance on Nodes and
+Machines in the Kubeadm control plane provider. It is not an issue in
+the example we have seen here since both Metal³ and BYOH create Nodes.
+However, there are some projects where Nodes are unnecessary. See for
+example Kamaji, which aims to
+integrate with the cluster API. The idea here is to run the control
+plane components in the management cluster as Pods. Naturally, there
+would not be any control plane Nodes or Machines in this case. (A second
+provider would be used to add workers.) But the Kubeadm control plane
+provider expects there to be both Machines and Nodes for the control
+plane, so a new provider is likely needed to make this work as desired.
+
+
This issue can already be seen in the
+vcluster
+provider, where the Cluster stays in Provisioning state because it is
+“Waiting for the first control plane machine to have its
+status.nodeRef set”. The idea with vcluster is to reuse the Nodes of
+the management cluster but provide a separate control plane. This gives
+users better isolation than just namespaces without the need for another
+“real” cluster. It is for example possible to have different custom
+resource definitions in each vcluster. But since vcluster runs all the
+pods (including the control plane) in the management cluster, there will
+never be a control plane Machine or nodeRef.
+
+
There is already one implementation of a control plane provider without
+Nodes, i.e. the EKS provider. Perhaps this is the way forward. One
+implementation for each specific case. It would be nice if it was
+possible to do it in a more generic way though, similar to how the
+Kubeadm control plane provider is used by almost all infrastructure
+providers.
+
+
To summarize, there is already some support for mixed clusters with
+multiple providers. However, there are some issues that make it
+unnecessarily awkward. Two things that could be improved in the cluster
+API would be the following:
+
+
+
Make the cluster.infrastructureRef into a list to allow multiple
+infrastructure providers to be registered.
+
Drop the assumption that there will always be control plane Machines
+and Nodes (e.g. by implementing a new control plane provider).
We want to ensure that Metal3 can scale to thousands of nodes and clusters.
+However, running tests with thousands of real servers is expensive and we don’t have access to any such large environment in the project.
+So instead we have been focusing on faking the hardware while trying to keep things as realistic as possible for the controllers.
+In this first part we will take a look at the Bare Metal Operator and the test mode it offers.
+The next part will be about how to fake the Kubernetes API of the workload clusters.
+In the final post we will take a look at the issues we ran into and what is being done in the community to address them so that we can keep scaling!
+
+
Some background on how to fool the controllers
+
+
With the full Metal3 stack, from Ironic to Cluster API, we have the following controllers that operate on Kubernetes APIs:
+
+
+
Cluster API Kubeadm control plane controller
+
Cluster API Kubeadm bootstrap controller
+
Cluster API controller
+
Cluster API provider for Metal3 controller
+
IP address manager controller
+
Bare Metal Operator controller
+
+
+
We will first focus on the controllers that interact with Nodes, Machines, Metal3Machines and BareMetalHosts, i.e. objects related to actual physical machines that we need to fake.
+In other words, we are skipping the IP address manager for now.
+
+
What do these controllers care about really?
+What do we need to do to fool them?
+At the Cluster API level, the controllers just care about the Kubernetes resources in the management cluster (e.g. Clusters and Machines) and some resources in the workload cluster (e.g. Nodes and the etcd Pods).
+The controllers will try to connect to the workload clusters in order to check the status of the resources there, so if there is no real workload cluster, this is something we will need to fake if we want to fool the controllers.
+When it comes to Cluster API provider for Metal3, it connects the abstract high level objects with the BareMetalHosts, so here we will need to make the BareMetalHosts behave realistically in order to provide a good test.
+
+
This is where the Bare Metal Operator test mode comes in.
+If we can fake the workload cluster API and the BareMetalHosts, then all the Cluster API controllers and the Metal3 provider will get a realistic test that we can use when working on scalability.
+
+
Bare Metal Operator test mode
+
+
The Bare Metal Operator has a test mode, in which it doesn’t talk to Ironic.
+Instead it just pretends that everything is fine and all actions succeed.
+In this mode the BareMetalHosts will move through the state diagram just like they normally would (but quite a bit faster).
+To enable it, all you have to do is add the -test-mode flag when running the Bare Metal Operator controller.
+For convenience there is also a make target (make run-test-mode) that will run the Bare Metal Operator directly on the host in test mode.
+
+
Here is an example of how to use it.
+You will need kind and kubectl installed for this to work, but you don’t need the Bare Metal Operator repository cloned.
+
+
+
+
Create a kind cluster and deploy cert-manager (needed for web hook certificates):
# Create the namespace where it will run
+kubectl create ns baremetal-operator-system
+# Deploy it in normal mode
+kubectl apply -k https://github.com/metal3-io/baremetal-operator/config/default
+# Patch it to run in test mode
+kubectl patch -n baremetal-operator-system deploy baremetal-operator-controller-manager --type=json \
+ -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--test-mode"}]'
+
+
+
+
In a separate terminal, create a BareMetalHost from the example manifests:
After applying the BareMetalHost, it will quickly go through registering and become available.
+
+
$ kubectl get bmh
+NAME STATE CONSUMER ONLINE ERROR AGE
+example-baremetalhost registering true 2s
+$ kubectl get bmh
+NAME STATE CONSUMER ONLINE ERROR AGE
+example-baremetalhost available true 6s
+
+
+
We can now provision the BareMetalHost, turn it off, deprovision, etc.
+Just like normal, except that the machine doesn’t exist.
+Let’s try provisioning it!
You will see it go through provisioning and end up in provisioned state:
+
+
$ kubectl get bmh
+NAME STATE CONSUMER ONLINE ERROR AGE
+example-baremetalhost provisioning true 7m20s
+
+$ kubectl get bmh
+NAME STATE CONSUMER ONLINE ERROR AGE
+example-baremetalhost provisioned true 7m22s
+
+
+
Wrapping up
+
+
With Bare Metal Operator in test mode, we have the foundation for starting our scalability journey.
+We can easily create BareMetalHost objects and they behave similarly to how they would in a real scenario.
+A simple bash script will at this point allow us to create as many BareMetalHosts as we would like.
+To wrap things up, we will now do just that: put together a script and try generating a few BareMetalHosts.
+
+
The script will do the same thing we did before when creating the example BareMetalHost, but it will also give them different names so we don’t get naming collisions.
+Here it is:
Save it as produce-available-hosts.sh and try it out:
+
+
$ ./produce-available-hosts.sh 10 | kubectl apply -f -
+secret/worker-1-bmc-secret created
+baremetalhost.metal3.io/worker-1 created
+secret/worker-2-bmc-secret created
+baremetalhost.metal3.io/worker-2 created
+secret/worker-3-bmc-secret created
+baremetalhost.metal3.io/worker-3 created
+secret/worker-4-bmc-secret created
+baremetalhost.metal3.io/worker-4 created
+secret/worker-5-bmc-secret created
+baremetalhost.metal3.io/worker-5 created
+secret/worker-6-bmc-secret created
+baremetalhost.metal3.io/worker-6 created
+secret/worker-7-bmc-secret created
+baremetalhost.metal3.io/worker-7 created
+secret/worker-8-bmc-secret created
+baremetalhost.metal3.io/worker-8 created
+secret/worker-9-bmc-secret created
+baremetalhost.metal3.io/worker-9 created
+secret/worker-10-bmc-secret created
+baremetalhost.metal3.io/worker-10 created
+$ kubectl get bmh
+NAME STATE CONSUMER ONLINE ERROR AGE
+worker-1 registering true 2s
+worker-10 available true 2s
+worker-2 available true 2s
+worker-3 available true 2s
+worker-4 available true 2s
+worker-5 available true 2s
+worker-6 registering true 2s
+worker-7 available true 2s
+worker-8 available true 2s
+worker-9 available true 2s
+
+
+
With this we conclude the first part of the scaling series.
+In the next post, we will take a look at how to fake the other end of the stack: the workload cluster API.
In part 1, we introduced the Bare Metal Operator test mode and saw how it can be used to play with BareMetalHosts without Ironic and without any actual hosts.
+Now we will take a look at the other end of the stack and how we can fake the workload cluster APIs.
+
+
Test setup
+
+
The end goal is to have one management cluster where the Cluster API and Metal3 controllers run.
+In this cluster we would generate BareMetalHosts and create Clusters, Metal3Clusters, etc to benchmark the controllers.
+To give them a realistic test, we also need to fake the workload cluster APIs.
+These will run separately in “backing” clusters to avoid interfering with the test (e.g. by using up all the resources in the management cluster).
+Here is a diagram that describes the setup:
+
+
+
+
How are we going to fake the workload cluster APIs then?
+The most obvious solution is to just run the real deal, i.e. the kube-apiserver.
+This is what would be run in a real workload cluster, together with the other components that make up the Kubernetes control plane.
+
+
If you want to follow along and try to set this up yourself, you will need at least the following tools installed:
This has been tested with Kubernetes v1.25, kind v0.19 and clusterctl v1.4.2.
+All script snippets are assumed to be for the bash shell.
+
+
Running the Kubernetes API server
+
+
There are many misconceptions, maybe even superstitions, about the Kubernetes control plane.
+The fact is that it is in no way special.
+It consists of a few programs that can be run in any way you want: in a container, as a systemd unit or directly executed at the command line.
+They can run on a Node or outside of the cluster.
+You can even run multiple instances on the same host as long as you avoid port collisions.
+
+
For our purposes we basically want to run as little as possible of the control plane components.
+We just need the API to be available and possible for us to populate with data that the controllers expect to be there.
+In other words, we need the API server and etcd.
+The scheduler is not necessary since we won’t run any actual workload (we are just pretending the Nodes are there anyway) and the controller manager would just get in the way when we want to fake resources.
+It would, for example, try to update the status of the (fake) Nodes that we want to create.
+
+
The API server will need an etcd instance to connect to.
+It will also need some TLS configuration, both for connecting to etcd and for handling service accounts.
+One simple way to generate the needed certificates is to use kubeadm.
+But before we get there we need to think about what the configuration should look like.
+
+
For simplicity, we will simply run the API server and etcd in a kind cluster for now.
+It would then be easy to run them in some other Kubernetes cluster later if needed.
+Let’s create it right away:
+
+
kind create cluster
+# Note: This has been tested with node image
+# kindest/node:v1.26.3@sha256:61b92f38dff6ccc29969e7aa154d34e38b89443af1a2c14e6cfbd2df6419c66f
+
+
+
To try to cut down on the resources required, we will also use a single multi-tenant etcd instance instead of one per API server.
+We can rely on the internal service discovery so the API server can find etcd via an address like etcd-server.etcd-system.svc.cluster.local, instead of using IP addresses.
+Finally, we will need an endpoint where the API is exposed to the cluster where the controllers are running, but for now we can focus on just getting it up and running with 127.0.0.1:6443 as the endpoint.
+
+
Based on the above, we can create a kubeadm-config.yaml file like this:
As mentioned before, we want to create a multi-tenant etcd that many API servers can share.
+For this reason, we will need to create a root user and enable authentication for etcd:
At this point we have a working etcd instance with authentication and TLS enabled.
+Each client will need to have an etcd user to interact with this instance so we need to create an etcd user for the API server.
+We already created a root user before so this should look familiar.
+
+
## Create etcd tenant
+# Create user
+kubectl -n etcd-system exec etcd-0 -- etcdctl --user root:rootpw \
+ --key=/etc/kubernetes/pki/etcd/tls.key --cert=/etc/kubernetes/pki/etcd/tls.crt --cacert /etc/kubernetes/pki/ca/tls.crt \
+ user add test --new-user-password=test
+# Create role
+kubectl -n etcd-system exec etcd-0 -- etcdctl --user root:rootpw \
+ --key=/etc/kubernetes/pki/etcd/tls.key --cert=/etc/kubernetes/pki/etcd/tls.crt --cacert /etc/kubernetes/pki/ca/tls.crt \
+ role add test
+# Add read/write permissions for prefix to the role
+kubectl -n etcd-system exec etcd-0 -- etcdctl --user root:rootpw \
+ --key=/etc/kubernetes/pki/etcd/tls.key --cert=/etc/kubernetes/pki/etcd/tls.crt --cacert /etc/kubernetes/pki/ca/tls.crt \
+ role grant-permission test --prefix=true readwrite "/test/"
+# Give the user permissions from the role
+kubectl -n etcd-system exec etcd-0 -- etcdctl --user root:rootpw \
+ --key=/etc/kubernetes/pki/etcd/tls.key --cert=/etc/kubernetes/pki/etcd/tls.crt --cacert /etc/kubernetes/pki/ca/tls.crt \
+ user grant-role test test
+
+
+
From etcd’s point of view, everything is now ready.
+The API server could theoretically use etcdctl and authenticate with the username and password that we created for it.
+However, that is not how the API server works.
+It expects to be able to authenticate using client certificates.
+Luckily, etcd supports this so we just have to generate the certificates and sign them so that etcd trusts them.
+The key thing is to set the common name in the certificate to the name of the user we want to authenticate as.
+
+
Since kubeadm always sets the same common name, we will here use openssl to generate the client certificates so that we get control over it.
In order to deploy the API server, we will first need to generate some more certificates.
+The client certificates for connecting to etcd are already ready, but it also needs certificates to secure the exposed API itself, and a few other things.
+Then we will also need to create secrets from all of these certificates:
Time to check if it worked!
+We can use port-forwarding to access the API, but of course we will need some authentication method for it to be useful.
+With kubeadm we can generate a kubeconfig based on the certificates we already have.
Note that it won’t have any Nodes or Pods running.
+It is completely empty since it is running on its own.
+There is no kubelet that registered as a Node or applied static manifests, there is no scheduler or controller manager.
+Exactly like we want it.
+
+
Faking Nodes and other resources
+
+
Let’s take a step back and think about what we have done so far.
+We have deployed a Kubernetes API server and a multi-tenant etcd instance.
+More API servers can be added in the same way, so it is straightforward to scale.
+All of it runs in a kind cluster, which means that it is easy to set up and we can switch to any other Kubernetes cluster if needed later.
+Through Kubernetes we also get an easy way to access the API servers by using port-forwarding, without exposing all of them separately.
+
+
The time has now come to think about what we need to put in the workload cluster API to convince the Cluster API and Metal3 controllers that it is healthy.
+First of all they will expect to see Nodes that match the Machines and that they have a provider ID set.
+Secondly, they will expect to see healthy control plane Pods.
+Finally, they will try to check on the etcd cluster.
+
+
The final point is a problem, but we can work around it for now by configuring external etcd.
+It will lead to a different code path for the bootstrap and control plane controllers, but until we have something better it will be a good enough test.
+
+
Creating the Nodes and control plane Pods is really easy though.
+We are just adding resources and there are no controllers or validating webhooks that can interfere.
+Try it out!
+
+
# Create a Node
+kubectl --kubeconfig=kubeconfig.yaml create -f https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/fake-node.yaml
+# Check that it worked
+kubectl --kubeconfig=kubeconfig.yaml get nodes
+# Maybe label it as part of the control plane?
+kubectl --kubeconfig=kubeconfig.yaml label node fake-node node-role.kubernetes.io/control-plane=""
+
+
+
Now add a Pod:
+
+
kubectl --kubeconfig=kubeconfig.yaml create -f https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/kube-apiserver-pod.yaml
+# Set status on the pods (it is not added when using create/apply).
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/kube-apiserver-pod-status.yaml |
+ kubectl --kubeconfig=kubeconfig.yaml -n kube-system patch pod kube-apiserver-node-name \
+ --subresource=status --patch-file=/dev/stdin
+
+
+
You should be able to see something like this:
+
+
$ kubectl --kubeconfig kubeconfig.yaml get pods -A
+NAMESPACE NAME READY STATUS RESTARTS AGE
+kube-system kube-apiserver-node-name 1/1 Running 0 16h
+$ kubectl --kubeconfig kubeconfig.yaml get nodes
+NAME STATUS ROLES AGE VERSION
+fake-node Ready <none> 16h v1.25.3
+
+
+
Now all we have to do is to ensure that the API returns information that the controllers expect.
+
+
Hooking up the API server to a Cluster API cluster
+
+
We will now set up a fresh cluster where we can run the Cluster API and Metal3 controllers.
+
+
# Delete the previous cluster
+kind delete cluster
+# Create a fresh new cluster
+kind create cluster
+# Initialize Cluster API with Metal3
+clusterctl init --infrastructure metal3
+## Deploy the Bare Metal Operator
+# Create the namespace where it will run
+kubectl create ns baremetal-operator-system
+# Deploy it in normal mode
+kubectl apply -k https://github.com/metal3-io/baremetal-operator/config/default
+# Patch it to run in test mode
+kubectl patch -n baremetal-operator-system deploy baremetal-operator-controller-manager --type=json \
+ -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--test-mode"}]'
+
+
+
You should now have a cluster with the Cluster API, Metal3 provider and Bare Metal Operator running.
+Next, we will prepare some files that will come in handy later, namely a cluster template, BareMetalHost manifest and Kubeadm configuration file.
With this we have enough to start creating the workload cluster.
+First, we need to set up some certificates.
+This should look very familiar from earlier when we created certificates for the Kubernetes API server and etcd.
We are now ready to create the cluster!
+We just need a few variables for the template.
+The important part here is the CLUSTER_APIENDPOINT_HOST and CLUSTER_APIENDPOINT_PORT, since this will be used by the controllers to connect to the workload cluster API.
+You should set the IP to the private IP of the test machine or similar.
+This way we can use port-forwarding to expose the API on this IP, which the controllers can then reach.
+The port just has to be one not in use, and preferably something that is easy to remember and associate with the correct cluster.
+For example, cluster 1 gets port 10001, cluster 2 gets 10002, etc.
This will give you a cluster and all the templates and other resources that are needed.
+However, we will need to fill in for the non-existent hardware and create the workload cluster API server, like we practiced before.
+This time it is slightly different, because some of the steps are handled by the Cluster API.
+We just need to take care of what would happen on the node, plus the etcd part since we are using external etcd configuration.
+
+
mkdir -p "/tmp/${CLUSTER}/pki/etcd"
+
+# Generate etcd client certificate
+openssl req -newkey rsa:2048 -nodes -subj "/CN=${CLUSTER}" \
+ -keyout "/tmp/${CLUSTER}/pki/apiserver-etcd-client.key" -out "/tmp/${CLUSTER}/pki/apiserver-etcd-client.csr"
+openssl x509 -req -in "/tmp/${CLUSTER}/pki/apiserver-etcd-client.csr" \
+ -CA /tmp/pki/etcd/ca.crt -CAkey /tmp/pki/etcd/ca.key -CAcreateserial \
+ -out "/tmp/${CLUSTER}/pki/apiserver-etcd-client.crt" -days 365
+
+# Get the k8s ca certificate and key.
+# This is used by kubeadm to generate the api server certificates
+kubectl -n "${NAMESPACE}" get secrets "${CLUSTER}-ca" -ojsonpath="{.data.tls\.crt}" | base64 -d > "/tmp/${CLUSTER}/pki/ca.crt"
+kubectl -n "${NAMESPACE}" get secrets "${CLUSTER}-ca" -ojsonpath="{.data.tls\.key}" | base64 -d > "/tmp/${CLUSTER}/pki/ca.key"
+
+# Generate certificates
+sed -e "s/NAMESPACE/${NAMESPACE}/g" -e "s/CLUSTER/${CLUSTER}/g" -e "s/HOST/${CLUSTER_APIENDPOINT_HOST}/g" \
+ /tmp/kubeadm-config-template.yaml > "/tmp/kubeadm-config-${CLUSTER}.yaml"
+kubeadm init phase certs apiserver --config "/tmp/kubeadm-config-${CLUSTER}.yaml"
+
+# Create secrets
+kubectl -n "${NAMESPACE}" create secret tls "${CLUSTER}-apiserver-etcd-client" --cert "/tmp/${CLUSTER}/pki/apiserver-etcd-client.crt" --key "/tmp/${CLUSTER}/pki/apiserver-etcd-client.key"
+kubectl -n "${NAMESPACE}" create secret tls apiserver --cert "/tmp/${CLUSTER}/pki/apiserver.crt" --key "/tmp/${CLUSTER}/pki/apiserver.key"
+
+
+
Now we will need to set up the fake cluster resources.
+For this we will create a second kind cluster and set up etcd, just like we did before.
Switch the context back to the first cluster with kubectl config use-context kind-kind so we don’t get confused about which is the main cluster.
+We will now need to put all the expected certificates for the fake cluster in the kind-backing-cluster-1 so that they can be used by the API server that we will deploy there.
+
+
CLUSTER=test-1
+NAMESPACE=test-1
+# Setup fake resources for cluster test-1
+kubectl --context=kind-backing-cluster-1 create namespace "${NAMESPACE}"
+kubectl --context=kind-backing-cluster-1 -n "${NAMESPACE}" create secret tls "${CLUSTER}-etcd" --cert /tmp/pki/etcd/ca.crt --key /tmp/pki/etcd/ca.key
+kubectl --context=kind-backing-cluster-1 -n "${NAMESPACE}" create secret tls "${CLUSTER}-ca" --cert /tmp/pki/ca.crt --key /tmp/pki/ca.key
+kubectl --context=kind-backing-cluster-1 -n "${NAMESPACE}" create secret tls "${CLUSTER}-apiserver-etcd-client" --cert "/tmp/${CLUSTER}/pki/apiserver-etcd-client.crt" --key "/tmp/${CLUSTER}/pki/apiserver-etcd-client.key"
+kubectl --context=kind-backing-cluster-1 -n "${NAMESPACE}" create secret tls apiserver --cert "/tmp/${CLUSTER}/pki/apiserver.crt" --key "/tmp/${CLUSTER}/pki/apiserver.key"
+
+kubectl -n "${NAMESPACE}" get secrets "${CLUSTER}-sa" -o yaml | kubectl --context=kind-backing-cluster-1 create -f -
+
+## Create etcd tenant
+# Create user
+kubectl --context=kind-backing-cluster-1 -n etcd-system exec etcd-0 -- etcdctl --user root:rootpw \
+ --key=/etc/kubernetes/pki/etcd/tls.key --cert=/etc/kubernetes/pki/etcd/tls.crt --cacert /etc/kubernetes/pki/ca/tls.crt \
+ user add "${CLUSTER}" --new-user-password="${CLUSTER}"
+# Create role
+kubectl --context=kind-backing-cluster-1 -n etcd-system exec etcd-0 -- etcdctl --user root:rootpw \
+ --key=/etc/kubernetes/pki/etcd/tls.key --cert=/etc/kubernetes/pki/etcd/tls.crt --cacert /etc/kubernetes/pki/ca/tls.crt \
+ role add "${CLUSTER}"
+# Add read/write permissions for prefix to the role
+kubectl --context=kind-backing-cluster-1 -n etcd-system exec etcd-0 -- etcdctl --user root:rootpw \
+ --key=/etc/kubernetes/pki/etcd/tls.key --cert=/etc/kubernetes/pki/etcd/tls.crt --cacert /etc/kubernetes/pki/ca/tls.crt \
+ role grant-permission "${CLUSTER}" --prefix=true readwrite "/${CLUSTER}/"
+# Give the user permissions from the role
+kubectl --context=kind-backing-cluster-1 -n etcd-system exec etcd-0 -- etcdctl --user root:rootpw \
+ --key=/etc/kubernetes/pki/etcd/tls.key --cert=/etc/kubernetes/pki/etcd/tls.crt --cacert /etc/kubernetes/pki/ca/tls.crt \
+ user grant-role "${CLUSTER}" "${CLUSTER}"
+
+
+
Check that the Metal3Machine is associated with a BareMetalHost.
+Deploy the API server.
+
+
# Deploy API server
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/manifests/v2/kube-apiserver-deployment.yaml |
+ sed -e "s/CLUSTER/${CLUSTER}/g" | kubectl --context=kind-backing-cluster-1 -n "${NAMESPACE}" apply -f -
+kubectl --context=kind-backing-cluster-1 -n "${NAMESPACE}" wait --for=condition=Available deploy/test-kube-apiserver
+
+# Get kubeconfig
+clusterctl -n "${NAMESPACE}" get kubeconfig "${CLUSTER}" > "/tmp/kubeconfig-${CLUSTER}.yaml"
+# Edit kubeconfig to point to 127.0.0.1:${CLUSTER_APIENDPOINT_PORT}
+sed -i -e "s/${CLUSTER_APIENDPOINT_HOST}/127.0.0.1/" -e "s/:6443/:${CLUSTER_APIENDPOINT_PORT}/" "/tmp/kubeconfig-${CLUSTER}.yaml"
+# Port forward for accessing the API
+kubectl --context=kind-backing-cluster-1 -n "${NAMESPACE}" port-forward \
+ --address "${CLUSTER_APIENDPOINT_HOST},127.0.0.1" svc/test-kube-apiserver "${CLUSTER_APIENDPOINT_PORT}":6443 &
+# Check that it is working
+kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" cluster-info
+
+
+
Now that we have a working API for the workload cluster, the only remaining thing is to put everything that the controllers expect in it.
+This includes adding a Node to match the Machine as well as static pods that Cluster API expects to be there.
+Let’s start with the Node!
+The Node must have the correct name and a label with the BareMetalHost UID so that the controllers can put the correct provider ID on it.
+We have only created 1 BareMetalHost so it is easy to pick the correct one.
+The name of the Node should be the same as the Machine, which is also only a single one.
+
+
machine="$(kubectl -n "${NAMESPACE}" get machine -ojsonpath="{.items[0].metadata.name}")"
+bmh_uid="$(kubectl -n "${NAMESPACE}" get bmh -ojsonpath="{.items[0].metadata.uid}")"
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/fake-node.yaml |
+ sed -e "s/fake-node/${machine}/g" -e "s/fake-uuid/${bmh_uid}/g" | \
+ kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" create -f -
+# Label it as control-plane since this is a control-plane node.
+kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" label node "${machine}" node-role.kubernetes.io/control-plane=""
+# Upload kubeadm config to configmap. This will mark the KCP as initialized.
+kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" -n kube-system create cm kubeadm-config \
+ --from-file=ClusterConfiguration="/tmp/kubeadm-config-${CLUSTER}.yaml"
+
+
+
This should be enough to make the Machines healthy!
+You should be able to see something similar to this:
However, if you check the KubeadmControlPlane more carefully, you will notice that it is still complaining about control plane components.
+This is because we have not created the static pods yet, and it is also unable to check the certificate expiration date for the Machine.
+Let’s fix it:
+
+
# Add static pods to make kubeadm control plane manager happy
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/kube-apiserver-pod.yaml |
+ sed "s/node-name/${machine}/g" |
+ kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" create -f -
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/kube-controller-manager-pod.yaml |
+ sed "s/node-name/${machine}/g" |
+ kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" create -f -
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/kube-scheduler-pod.yaml |
+ sed "s/node-name/${machine}/g" |
+ kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" create -f -
+# Set status on the pods (it is not added when using create/apply).
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/kube-apiserver-pod-status.yaml |
+ kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" -n kube-system patch pod "kube-apiserver-${machine}" \
+ --subresource=status --patch-file=/dev/stdin
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/kube-controller-manager-pod-status.yaml |
+ kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" -n kube-system patch pod "kube-controller-manager-${machine}" \
+ --subresource=status --patch-file=/dev/stdin
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/kube-scheduler-pod-status.yaml |
+ kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" -n kube-system patch pod "kube-scheduler-${machine}" \
+ --subresource=status --patch-file=/dev/stdin
+
+# Add certificate expiry annotations to make kubeadm control plane manager happy
+CERT_EXPIRY_ANNOTATION="machine.cluster.x-k8s.io/certificates-expiry"
+EXPIRY_TEXT="$(kubectl -n "${NAMESPACE}" get secret apiserver -ojsonpath="{.data.tls\.crt}" | base64 -d | openssl x509 -enddate -noout | cut -d= -f 2)"
+EXPIRY="$(date --date="${EXPIRY_TEXT}" --iso-8601=seconds)"
+kubectl -n "${NAMESPACE}" annotate machine "${machine}" "${CERT_EXPIRY_ANNOTATION}=${EXPIRY}"
+kubectl -n "${NAMESPACE}" annotate kubeadmconfig --all "${CERT_EXPIRY_ANNOTATION}=${EXPIRY}"
+
+
+
Now we finally have a completely healthy cluster as far as the controllers are concerned.
+
+
Conclusions and summary
+
+
We now have all the tools necessary to start experimenting.
+
+
+
With the BareMetal Operator running in test mode, we can skip Ironic and still work with BareMetalHosts that act like normal.
+
We can set up separate “backing” clusters where we run etcd and multiple API servers to fake the workload cluster APIs.
+
Fake Nodes and Pods can be easily added to the workload cluster APIs, and configured as we want.
+
The workload cluster APIs can be exposed to the controllers in the test cluster using port-forwarding.
+
+
+
In this post we have not automated all of this, but if you want to see a scripted setup, take a look at this.
+It is what we used to scale to 1000 clusters.
+Just remember that it may need some tweaking for your specific environment if you want to try it out!
+
+
Specifically, we used 10 “backing” clusters, i.e. 10 separate cloud VMs with kind clusters where we run etcd and the workload cluster APIs.
+Each one would hold 100 API servers.
+The test cluster was on its own separate VM also running a kind cluster with all the controllers and all the Cluster objects, etc.
+
+
In the next and final blog post of this series we will take a look at the results of all this.
+What issues did we run into along the way?
+How did we fix or work around them?
+We will also take a look at what is going on in the community related to this and discuss potential future work in the area.
If you’re a developer or contributor to the Metal3 project, you may need
+to run the Metal3 website locally to test changes and ensure everything
+looks as expected before deploying them. In this guide, we’ll walk you
+through the process of setting up and running Metal3’s website locally
+on your machine using Jekyll.
+
+
Prerequisites
+
+
Before we begin, make sure you have the following prerequisites
+installed on your system:
+
+
+
+
Ruby: Jekyll, the static site generator used by Metal3, is built with
+Ruby. Install Ruby and its development tools by running the following
+command in your terminal:
+
+
sudo apt install ruby-full
+
+
+
+
+
Setting up Metal3’s Website
+
+
Once Ruby is installed, we can proceed to set up Metal3’s website and
+its dependencies. Follow these steps:
+
+
+
+
Clone the Metal3 website repository from GitHub. Open your terminal
+and navigate to the directory where you want to clone the repository,
+then run the following command:
Install the required gems and dependencies using Bundler. Run the
+following command:
+
+
bundle install
+
+
+
+
+
Running the Metal3 Website Locally
+
+
With Metal3’s website and its dependencies installed, you can now start the local
+development server to view and test the website. In the terminal, navigate to the
+project’s root directory (metal3-io.github.io) and run the following command:
+
+
bundle exec jekyll serve
+
+
+
This command tells Jekyll to build the website and start a local server.
+Once the server is running, you’ll see output indicating the local
+address where the Metal3 website is being served, typically
+http://localhost:4000.
+
+
Open your web browser and enter the provided address. Congratulations!
+You should now see the Metal3 website running locally, allowing you to
+preview your changes and ensure everything is working as expected.
+
+
Conclusion
+
+
Running Metal3’s website locally using Jekyll is a great way to test
+changes and ensure the site functions properly before deploying them. By
+following the steps outlined in this guide, you’ve successfully set up
+and run Metal3’s website locally. Feel free to explore the Metal3
+documentation and contribute to the project further.
The Metal3 project was present at KubeCon EU 2024 with multiple maintainers,
+contributors and users! For many of us, this was the first time we met in the
+physical world, despite working together for years already. This was very
+valuable and appreciated by many of us, I am sure. We had time to casually
+discuss ideas and proposals, hack together on the
+ironic-standalone-operator
+and simply get to know each other.
+
+
+
+
Photo by Michael Captain.
+
+
As a project, we had the opportunity to give an update through a lightning
+talk on Tuesday!
+
+
+
+
+
+
On Wednesday we continued with a contribfest session
+where we gave an introduction to the project for potential new contributors. We
+had prepared a number of good-first-issue’s that people could choose from if
+they wanted. Perhaps more important though, was that we had time to answer
+questions, discuss use-cases, issues and features with the attendees. The new
+quick-start page was also launched just in
+time for the contribfest. It should hopefully make it easier to get started with
+the project and we encourage everyone to run through it and report or fix any
+issues found.
+
+
+
+
Photo from the official CNCF Flickr. More photos
+here.
+
+
Finally, just like in previous years, we had a table in the Project Pavilion. There was a
+lot of interest in Metal3, more than last year I would say. Even with five
+maintainers working in parallel, we still had a hard time keeping up with the
+amount of people stopping by to ask questions! My takeaway from this event is
+that we still have work to do on explaining what Metal3 is and how it works. It
+is quite uncommon that people know about baseboard management controllers (BMCs)
+and this of course makes it harder to grasp what Metal3 is all about. However,
+the interest is there, so we just need to get the information out there so that
+people can learn! Another takeaway is that Cluster API in general seems to
+really take off. Many people that came by our kiosk knew about Cluster API and
+were interested in Metal3 because of the integration we have with it.
+
+
For those of you who couldn’t attend, I hope this post gives an idea about what
+happened at KubeCon related to Metal3. Did you miss the contribfest? Maybe you
+would like to contribute but don’t know where to start? Check out the
+good-first-issue’s!
+There are still plenty to choose from, and we will keep adding more.
In part 1, we introduced the
+Bare Metal Operator test mode and saw how it can be used to play with
+BareMetalHosts without Ironic and without any actual hosts. We continued in
+part 2 with how to fake
+workload clusters enough for convincing Cluster API’s controllers that they are
+healthy. These two pieces together allowed us to run scaling tests and reach our
+target of 1000 single node clusters. In this final part of the blog post series,
+we will take a look at the results, the issues that we encountered and the
+improvements that have been made.
+
+
+
Issues encountered and lessons learned
+
+
As part of this work we have learned a lot. We found genuine bugs and
+performance issues, but we also learned about relevant configuration options for
+Cluster API and controllers in general.
+
+
One of the first things we hit was this bug in Bare Metal
+Operator that
+caused endless requeues for some deleted objects. It was not a big deal, barely
+noticeable, at small scale. However, at larger scales things like this become a
+problem. The logs become unreadable as they are filled with “spam” from
+requeuing deleted objects and the controller is wasting resources trying to
+reconcile them.
+
+
As mentioned, we also learned a lot from this experiment. For example, that all
+the controllers have flags for setting their concurrency, i.e. how many objects
+they reconcile in parallel. The default is 10, which works well in most cases,
+but for larger scales it may be necessary to tune this in order to speed up the
+reconciliation process.
+
+
The next thing we hit was rate limits! Both
+client-go
+and
+controller-runtime
+have default rate limits of 10 and 20 QPS (Queries Per Second) respectively that
+the controllers inherit unless overridden. In general, this is a good thing, as
+it prevents controllers from overloading the API server. They obviously become
+an issue once you scale far enough though. For us that happened when we got to
+600 clusters.
+
+
Why 600? The number was actually a good clue, and the reason we managed to figure
+out what was wrong! Let’s break it down. By default, the Cluster API controller
+will reconcile objects every 10 minutes (=600 seconds) in addition to reacting
+to events. Each reconciliation will normally involve one or more API calls, so
+at 600 clusters, we would have at least one API call per second just from the
+periodic sync. In other words, the controllers would at this point use up a
+large part of their budget on periodic reconciliation and quickly reach their
+limit when adding reactions to events, such as the creation of a new cluster.
+
+
At the time, these rate limits were not configurable in the Cluster API
+controllers, so we had to patch the controllers to increase the limits. We have
+since then added flags to the controllers to make this configurable. If you
+found this interesting, you can read more about it in this
+issue.
+
+
With concurrency and rate limits taken care of, we managed to reach our target
+of 1000 clusters in reasonable time. However, there was still a problem with
+resource usage. The Kubeadm control plane controller was unreasonably CPU
+hungry!
+
+
Luckily, Cluster API has excellent debugging and monitoring tools
+available so it was easy to
+collect data and profile the controllers. A quick look at the dashboard
+confirmed that the Kubeadm control plane controller was indeed the culprit, with
+a CPU usage far higher than the other controllers.
+
+
+
+
We then collected some profiling data and found the cause of the CPU usage. It
+was generating new private keys for accessing the workload cluster API server
+every time it needed to access it. This is a CPU intensive operation, and it
+happened four times per reconciliation! The flame graph seen below clearly shows
+the four key generation operations, and makes it obvious that this is what takes
+up most of the time spent on the CPU for the controller.
+
+
+
+
Improvements
+
+
All issues mentioned in the previous section have been addressed. The Bare Metal
+Operator is no longer re-queuing deleted objects. All controllers have flags for
+setting their concurrency and rate limits, and the Kubeadm control plane
+controller is now caching and reusing the private keys instead of generating new
+ones every time.
+
+
The impact of all of this is that
+
+
+
the Bare Metal Operator has more readable logs and lower CPU usage,
+
users can configure rate limits for all Cluster API and Metal3 controllers if
+necessary, and
+
the Kubeadm control plane controller has a much lower CPU usage and faster
+reconciliation times.
+
+
+
Results
+
+
When we set out, it was simply not possible to reach a scale of 1000 clusters in
+a reasonable time. With the collaboration, help from maintainers and other
+community members, we managed to reach our target. It is now possible to manage
+thousands of workload clusters through a single Cluster API management cluster.
Cluster API itself now also has an in-memory
+provider
+which makes it almost trivial to test large scale scenarios. However, it must be
+noted that it can only be used to test the core, bootstrap and control plane
+providers. If you want to try it out, you can use the following script. Please
+note that this will still be CPU intensive, despite the improvements mentioned
+above. Creating 1000 clusters is no small task!
This should result in 1000 ready in-memory clusters (and a pretty hot laptop if
+you run it locally). On a laptop with an i9-12900H CPU, it took about 15 minutes
+until all clusters were ready.
+
+
Conclusion and next steps
+
+
We are very happy with the results we achieved. The community has been very
+helpful and responsive, and we are very grateful for all the help we received.
+Going forward, we will hopefully be able to run scale tests periodically to
+ensure that we are not regressing. Even small scale tests can be enough to
+detect performance regressions as long as we keep track of the performance
+metrics. This is something we hope to incorporate into the CI system in the
+future.
If you’ve ever tried scaling out Kubernetes clusters in a bare-metal
+environment, you’ll know that large-scale testing comes with serious challenges.
+Most of us don’t have access to enough physical servers—or even virtual
+machines—to simulate the kinds of large-scale environments we need for stress
+testing, especially when deploying hundreds or thousands of clusters.
+
+
That’s where this experiment comes in.
+
+
Using Metal3, we simulated a massive environment—provisioning 1000 single-node
+Kubernetes clusters—without any actual hardware. The trick? A combination of
+Fake Ironic Python Agents (IPA) and Fake Kubernetes API servers. These tools
+allowed us to run an entirely realistic Metal3 provisioning workflow while
+simulating thousands of nodes and clusters, all without needing a single real
+machine.
+
+
The motivation behind this was simple: to create a scalable testing environment
+that lets us validate Metal3’s performance, workflow, and reliability without
+needing an expensive hardware lab or virtual machine fleet. By simulating nodes
+and clusters, we could push the limits of Metal3’s provisioning process
+cost-effectively and time-efficiently.
+
+
In this post, I’ll explain exactly how it all works, from setting up multiple
+Ironic services to faking hardware nodes and clusters and sharing the lessons
+learned. Whether you’re a Metal3 user or just curious about how to test
+large-scale Kubernetes environments, it’ll surely be a good read. Let’s get
+started!
+
+
Prerequisites & Setup
+
+
Before diving into the fun stuff, let’s ensure we’re on the same page. You don’t
+need to be a Metal3 expert to follow along, but having a bit of background will
+help!
+
+
What You’ll Need to Know
+
+
Let’s start by ensuring you’re familiar with some essential tools and concepts
+that power the Metal3 workflow. If you’re confident in your Metal3 skills, please
+feel free to skip this part.
+
+
A typical Metal3 Workflow
+
+
The following diagram explains a typical Metal3 workflow. We will, then, go into
+details of every component.
+
+
+
+
Cluster API (CAPI)
+
+
CAPI is a project that simplifies the deployment and management of Kubernetes
+clusters. It provides a consistent way to create, update, and scale clusters
+through Kubernetes-native APIs. The magic of CAPI is that it abstracts away many
+of the underlying details so that you can manage clusters on different platforms
+(cloud, bare metal, etc.) in a unified way.
+
+
Cluster API Provider Metal3 (CAPM3)
+
+
CAPM3 extends CAPI to work specifically with Metal3 environments. It connects
+the dots between CAPI, BMO, and Ironic, allowing Kubernetes clusters to be
+deployed on bare-metal infrastructure. It handles tasks like provisioning new
+nodes, registering them with Kubernetes, and scaling clusters.
+
+
Bare Metal Operator (BMO)
+
+
BMO is a controller that runs inside a Kubernetes cluster and works alongside
+Ironic to manage bare-metal infrastructure. It automates the lifecycle of
+bare-metal hosts, managing things like registering new hosts, powering them on
+or off, and monitoring their status.
+
+
Bare Metal Host (BMH)
+
+
A BMH is the Kubernetes representation of a bare-metal node. It contains
+information about how to reach the node it represents, and BMO monitors its
+desired state closely. When BMO notices that a BMH object state is requested to
+change (either by a human user or CAPM3), it will decide what needs to be done
+and tell Ironic.
+
+
Ironic & Ironic Python Agent (IPA)
+
+
+
Ironic is a bare-metal provisioning tool that handles tasks like booting
+servers, deploying bootable media (e.g., operating systems) to disk, and
+configuring hardware. Think of Ironic as the piece of software that manages
+actual physical servers. In a Metal3 workflow, Ironic receives orders from BMO
+and translates them into actionable steps. Ironic has multiple ways to interact
+with the machines, and one of them is the so-called “agent-based direct deploy”
+method, which is commonly used by BMO. The agent mentioned is called Ironic
+Python Agent (IPA), which is a piece of software that runs on each bare-metal
+node and carries out Ironic’s instructions. It interacts with the hardware
+directly, like wiping disks, configuring networks, and handling boot processes.
+
+
+
In a typical Metal3 workflow, BMO reads the desired state of the node from the
+BMH object, translates the Kubernetes reconciling logic to concrete actions, and
+forwards them to Ironic, which, as part of the provisioning process, tells IPA
+the exact steps it needs to perform to get the nodes to desired states. During
+the first boot after node image installation, Kubernetes components will be
+installed on the nodes by cloud-init, and once the process succeeds, Ironic
+and IPA finish the provisioning process, and CAPI and CAPM3 will verify the
+health of the newly provisioned Kubernetes cluster(s).
+
+
The Experiment: Simulating 1000 Kubernetes Clusters
+
+
This experiment aimed to push Metal3 to simulate 1000 single-node Kubernetes
+clusters on fake hardware. Instead of provisioning real machines, we used Fake
+Ironic Python Agents (Fake IPA) and Fake Kubernetes API Servers (FKAS) to
+simulate nodes and control planes, respectively. This setup allowed us to test a
+massive environment without the need for physical infrastructure.
+
+
Since our goal is to verify the Metal3 limit, our setup will let all the Metal3
+components (except for IPA, which runs inside and will be scaled with the nodes)
+keep working as they do in a typical workflow. In fact, none of the
+components should be aware that they are running with fake hardware.
+
+
Taking the figure we had earlier as a base, here is the revised workflow with fake
+nodes.
+
+
+
+
Step 1: Setting Up the environment
+
+
As you may know, a typical Metal3 workflow requires several components:
+bootstrap Kubernetes cluster, possible external networks, bare-metal nodes, etc.
+As we are working on simulating the environment, we will start with a newly
+spawned Ubuntu VM, create a cluster with minikube, add networks with libvirt,
+and so on (If you’re familiar with Metal3’s dev-env, this step is similar to
+what script
+01,
+02
+and a part of
+03
+do). We will not discuss this part, but you can find the related setup from
+this
+script
+if interested.
+
+
Note: If you intend to follow along, note that going to 1000 nodes requires
+a large environment and will take a long time. In our setup, we had a VM with 24
+cores and 32GB of RAM, of which we assigned 14 cores and 20GB of RAM to the
+minikube VM, and the process took roughly 48 hours. If your environment is less
+powerful, consider reducing the nodes you want to provision. Something like 100
+nodes will require minimal resources and time while still being impressive.
+
+
Step 2: Install BMO and Ironic
+
+
In Metal3’s typical workflow, we usually rely on Kustomize to install Ironic and
+BMO. Kustomize helps us define configurations for Kubernetes resources, making
+it easier to customize and deploy services. However, our current Kustomize
+overlay for Metal3 configures only a single Ironic instance. This setup works
+well for smaller environments, but it becomes a bottleneck when scaling up and
+handling thousands of nodes.
+
+
That’s where Ironic’s special mode comes into play. Ironic has the ability
+to run multiple Ironic conductors while sharing the same database. The best
+part? Workload balancing between conductors happens automatically, which means
+that no matter which Ironic conductor receives a request, the load is evenly
+distributed across all conductors, ensuring efficient provisioning. Achieving
+this requires separating ironic conductor from the database, which allows us
+to scale up the conductor part. Each conductor will have its own
+PROVISIONING_IP, hence the need to have a specialized configMap.
+
+
We used Helm for this purpose. In our Helm chart, the
+Ironic conductor container and HTTP server (httpd) container are
+separated into a new pod, and the rest of the ironic package (mostly
+MariaDB-ironic database) stays in another pod. A list of PROVISIONING_IPs is
+provided by the chart’s values.yaml, and for each IP, an ironic conductor
+pod is created, along with a config map whose values are rendered with the IP’s
+value. This way, we can dynamically scale up/down ironic (or, more specifically,
+ironic conductors) by simply adding/removing ips.
+
+
Another piece of information that we need to keep in mind is the ipa-downloader
+container. In our current metal3-dev-env, the IPA-downloader container runs as
+an init Container for ironic, and its job is to download the IPA image to a
+Persistent Volume. This image contains the Ironic Python Agent, and it is
+assumed to exist by Ironic. For the multiple-conductor scenario, running the
+same init-container for all the conductors, at the same time, could be slow
+and/or fail due to network issues. To make it work, we made a small “hack” in the
+chart: the ipa image will exist in a specific location inside the minikube host,
+and all the conductor pods will mount to that same location. In production, a
+more thorough solution might be to keep the IPA-downloader as an
+init-container, but point it to the local image server, which we set up
+in the previous step.
+
+
BMO, on the other hand, still works well with kustomize, as we do not need to
+scale it. As with typical metal3 workflow, BMO and Ironic must share some
+authentication to work with TLS.
+
+
You can check out the full Ironic helm chart
+here.
+
+
Step 3: Creating Fake Nodes with Fake Ironic Python Agents
+
+
As we mentioned at the beginning, instead of using real hardware, we will use a
+new tool called Fake Ironic Python Agent, or Fake IPA to simulate the
+nodes.
+
+
Setting up Fake IPA is relatively straightforward, as Fake IPA runs as
+containers on the host machine, but first, we need to create the list of “nodes”
+that we will use (Fake IPA requires that list to be ready when it starts). A
+“node” typically looks like this
All of the variables (uuid, node_name, macaddress) can be dynamically
+generated in any way you want (check this
+script
+out if you need an idea). Still, we must store this information to generate the
+BMH objects that match those “nodes.” The ip is, on the other hand, not
+essential. It could be anything.
+
+
We must also start up the sushy-tools container in this step. It is a tool
+that simulates the Baseboard Management
+Controller
+for non-bare-metal hardware, and we have been using it extensively inside Metal3
+dev-env and CI to control and provision VMs as if they are bare-metal nodes. In
+a bare-metal setup, Ironic will ask the BMC to install IPA on the node, and in
+our setup, sushy-tools will get the same request, but it will simply fake
+the installation and, in the end, forward Ironic traffic to the Fake IPA
+container.
+
+
Another piece of information we will need is the cert that Ironic will use
+in its communication with IPA. IPA is supposed to get it from Ironic, but as
+Fake IPA cannot do that (at least not yet), we must get the cert and provide
+it in Fake IPA config.
Also note that one set of sushy-tools and Fake IPA containers won’t be
+enough to provision 1000 nodes. Just like Ironic, they need to be scaled up
+extensively (about 20-30 pairs will be sufficient for 1000 nodes), but
+fortunately, the scaling is straightforward: We just need to give them different
+ports. Both of these components also require a Python-based config file. For
+convenience, in this setup, we create a big file and provide it to both of them,
+using the following shell script:
+
+
for i in$(seq 1 "$N_SUSHY");do
+ container_conf_dir="$SUSHY_CONF_DIR/sushy-$i"
+
+ # Use round-robin to choose fake-ipa and sushy-tools containers for the node
+ fake_ipa_port=$((9901+(($i%${N_FAKE_IPA:-1}))))
+ sushy_tools_port=$((8000+ i))
+ ports+=(${sushy_tools_port})
+
+ # This is only so that we have the list of the needed ports for other
+ # purposes, like configuring the firewalls.
+ ports+=(${fake_ipa_port})
+
+ mkdir-p"${container_conf_dir}"
+
+ # Generate the htpasswd file, which is required by sushy-tools
+ cat<<'EOF' >"${container_conf_dir}"/htpasswd
+admin:$2b$12$/dVOBNatORwKpF.ss99KB.vESjfyONOxyH.UgRwNyZi1Xs/W2pGVS
+EOF
+
+# Set configuration options
+ cat<<EOF >"${container_conf_dir}"/conf.py
+import collections
+
+SUSHY_EMULATOR_LIBVIRT_URI = "${LIBVIRT_URI}"
+SUSHY_EMULATOR_IGNORE_BOOT_DEVICE = False
+SUSHY_EMULATOR_VMEDIA_VERIFY_SSL = False
+SUSHY_EMULATOR_AUTH_FILE = "/root/sushy/htpasswd"
+SUSHY_EMULATOR_FAKE_DRIVER = True
+SUSHY_EMULATOR_LISTEN_PORT = "${sushy_tools_port}"
+EXTERNAL_NOTIFICATION_URL = "http://${ADVERTISE_HOST}:${fake_ipa_port}"
+FAKE_IPA_API_URL = "${API_URL}"
+FAKE_IPA_URL = "http://${ADVERTISE_HOST}:${fake_ipa_port}"
+FAKE_IPA_INSPECTION_CALLBACK_URL = "${CALLBACK_URL}"
+FAKE_IPA_ADVERTISE_ADDRESS_IP = "${ADVERTISE_HOST}"
+FAKE_IPA_ADVERTISE_ADDRESS_PORT = "${fake_ipa_port}"
+FAKE_IPA_CAFILE = "/root/cert/ironic-ca.crt"
+SUSHY_FAKE_IPA_LISTEN_IP = "${ADVERTISE_HOST}"
+SUSHY_FAKE_IPA_LISTEN_PORT = "${fake_ipa_port}"
+SUSHY_EMULATOR_FAKE_IPA = True
+SUSHY_EMULATOR_FAKE_SYSTEMS = $(cat nodes.json)
+EOF
+
+# Start sushy-tools
+ docker run -d--net host --name"sushy-tools-${i}"\
+ -v"${container_conf_dir}":/root/sushy \
+ "${SUSHY_TOOLS_IMAGE}"
+
+ # Start fake-ipa
+ docker run \
+ -d--net host --name fake-ipa-${i}\
+ -v"${container_conf_dir}":/app \
+ -v"$(realpath cert)":/root/cert \
+ "${FAKEIPA_IMAGE}"
+done
+
+
+
In this setup, we made it so that all the sushy-tools containers will
+listen on the port range running from 8001, 8002,…, while the Fake IPA
+containers have ports 9901, 9902,…
+
+
Step 4: Add the BMH objects
+
+
Now that we have sushy-tools and Fake IPA containers running, we can
+already generate the manifest for BMH objects, and apply them to the cluster. A
+BMH object will look like this
name is the node name we generated in the previous step.
+
uuid is the random uuid we generated for the same node.
+
random_mac is a random mac address for the boot. It’s NOT the same as the
+NIC mac address we generated for the node.
+
port is the listening port on one of the sushy-tools containers we
+created in the previous step. Since every sushy-tools and Fake IPA
+container has information about ALL the nodes, we can decide which container
+each “node” is located on. In general, it’s a good idea to spread them out, so all
+containers are loaded equally.
+
+
+
We can now run kubectl apply -f on one (or all of) the BMH manifests. What you
+expect to see is that a BMH object is created, and its state will change from
+registering to available after a while. It means ironic acknowledged
+that the node is valid, in a good state, and ready to be provisioned.
+
+
Step 5: Deploy the fake nodes to kubernetes clusters
+
+
Before provisioning our clusters, let’s init the process, so that we have CAPI
+and CAPM3 installed
+
+
clusterctl init --infrastructure=metal3
+
+
+
After a while, we should see that CAPI, CAPM3, and IPAM pods become available.
+
+
In a standard Metal3 workflow, after having the BMH objects in an available
+state, we can provision new Kubernetes clusters with clusterctl. However, with
+fake nodes, things get a tiny bit more complex. At the end of the provisioning
+process, Cluster API expects that there is a new kubernetes API server
+created for the new cluster, from which it will check if all nodes are up, all
+the control planes have apiserver, etcd, etc. pods up and running, and so
+on. It is where the Fake Kubernetes API Server
+(FKAS)
+comes in.
+
+
As the FKAS README linked above already described how it works, we won’t go
+into details. We simply need to send FKAS a register POST request (with
+the new cluster’s namespace and cluster name), and it will give us an IP and a
+port, which we can plug into our cluster template and then run clusterctl
+generate cluster.
+
+
Under the hood, FKAS generates unique API servers for different clusters.
+Each of the fake API servers does the following jobs:
+
+
+
Mimicking API Calls: The Fake Kubernetes API server was set up to respond to
+the essential Kubernetes API calls made during provisioning.
+
Node Registration: When CAPM3 registered nodes, the Fake API server returned
+success responses, making Metal3 believe the nodes had joined a real Kubernetes
+cluster.
+
Cluster Health and Status: The Fake API responded with “healthy” statuses,
+allowing CAPI/CAPM3 to continue its workflow without interruption.
+
Node Creation and Deletion: When CAPI queried for node status or attempted to
+add/remove nodes, the Fake API server responded realistically, ensuring the
+provisioning process continued smoothly.
+
Pretending to Host Kubelet: The Fake API server also simulated kubelet
+responses, which allowed CAPI/CAPM3 to interact with the fake clusters as though
+they were managing actual nodes.
+
+
+
Note that in this experiment, we provisioned every one of the 1000 fake nodes to
+a single-node cluster, but it’s possible to increase the number of control
+planes and worker nodes by changing the --control-plane-machine-count and
+--worker-machine-count parameters in the clusterctl generate cluster command.
+However, you will need to ensure that all clusters’ total nodes do not exceed
+the number of BMHs.
+
+
At a glance, the whole simulation looks like this:
+
+
+
+
It will likely take some time, but once the BMHs are all provisioned, we should
+be able to verify that all, or at least, most of the clusters are in good shape:
+
+
# This will list the clusters.
+kubectl get clusters -A
+
+# This will determine the clusters' readiness.
+kubectl get kcp -A
+
+
+
+
For each cluster, it’s also a good idea to perform a clusterctl
+check.
+
+
+
Accessing the fake cluster
+
+
A rather interesting (but not essential for our goal) check we can perform on
+the fake clusters is to try accessing them. Let’s start with fetching a
+cluster’s kubeconfig:
+
+
clusterctl -n <cluster-namespace> get kubeconfig <cluster-name> > kubeconfig-<cluster-name>.yaml
+
+
+
As usual, clusterctl will generate a kubeconfig file, but we cannot use it
+just yet. Recall that we generated the API endpoint using FKAS; the address we
+have now will be a combination of a port with FKAS’s IP address, which isn’t
+accessible from outside the cluster. What we should do now is:
+
+
+
Edit the kubeconfig-<cluster-name>.yaml so that the endpoint is in the form
+localhost:<port>.
+
Port-forward the FKAS Pod to the same port the kubeconfig has shown.
+
+
+
And voila, now we can access the fake cluster with kubectl --kubeconfig
+kubeconfig-<cluster-name>.yaml. You can inspect its state and check the
+resources (nodes, pods, etc.), but we won’t be able to run any workload on it as
+it’s fake.
+
+
Results
+
+
In this post, we have demonstrated how it is possible to “generate”
+bare-metal-based Kubernetes clusters from thin air (or rather, a bunch of nodes
+that do not exist). Of course, these “clusters” are not very useful. Still,
+successfully provisioning them without letting any of our main components
+(CAPI, CAPM3, BMO, and Ironic) know they are working with fake
+hardware proves that Metal3 is capable of handling a heavy workload and
+provisioning multiple nodes/clusters.
+
+
If interested, you could also check (and try out) the experiment by yourself
+here.
In the beginning, there was
+metal3-dev-env. It could set up a
+virtualized “baremetal” lab and test all the components together. As Metal3
+matured, it grew in complexity and capabilities, with release branches, API
+versions, etc. Metal3-dev-env did everything from cloning the repositories and
+building the container images, to deploying the controllers and running tests,
+on top of setting up the virtual machines and the networks, of course. Needless
+to say, it became hard to understand and easy to misuse.
+
+
We tried reducing the scope a bit by introducing end to end tests directly in
+the Cluster API provider
+Metal3
+(CAPM3). However, metal3-dev-env was still very much entangled with CAPM3. It
+was at this point that I got tired of trying to gradually fix it and took the
+initiative to start from scratch with end to end tests in Baremetal Operator
+(BMO) instead.
+
+
Up until that point, we had been testing BMO through CAPM3 and the cluster API
+flow. It worked, but it was very inefficient. From the perspective of the
+Baremetal Operator, a test could look something like this:
+
+
+
Register 5 BareMetalHosts
+
Inspect the 5 BareMetalHosts
+
Provision the 5 BareMetalHosts all with the same image
+
Deprovision 1 BareMetalHost
+
Provision it again with another image
+
Deprovision another BareMetalHost
+
Provision it again with the other image
+
Continue in the same way with the rest of the BareMetalHosts…
+
Deprovision all BareMetalHosts
+
+
+
As you can see, it is very repetitive, constantly doing the same thing again and
+again. As a consequence of this and the complexity of metal3-dev-env, it was
+quite an effort to thoroughly test something related to BMO code. I was
+constantly questioning myself and the test environment. “Is it testing the code
+I wrote?” “Is it doing the relevant scenario?” “Is the configuration correct?”
+
+
Baremetal Operator end to end tests are born
+
+
Sometimes it is easier to start from scratch, so this is what we
+did. The Baremetal
+Operator end to end tests started out as a small script that only set up
+minikube, some VMs and a baseboard management controller (BMC) emulator. The
+goal was simple: do the minimum required to simulate a baremetal lab. From this,
+it was quite easy to build a test module that was responsible for deploying the
+necessary controllers and running some tests.
+
+
Notice the separation of concerns here! The test module expects a baremetal lab
+environment to be already existing and the script that sets up the environment
+is not involved in any way with the tests or deployment of the controllers. This
+design is deliberate, with a clear goal that the test module should be useful
+across multiple environments. It should be possible to run the test suite
+against real baremetal labs with multiple different configurations. I am hoping
+that we will get a chance next year to try it for real in a baremetal lab.
+
+
How does it work?
+
+
The flexibility of the end to end module is possible through a configuration
+file. It can be used to configure everything from the image URL and checksum to
+the timeout limits. Since Ironic can be deployed in many different ways, it was
+also necessary to make this flexible. The user can optionally set up Ironic
+before the test, or provide a kustomization that will be applied automatically.
+A separate configuration file declares the BMCs that should be used in the
+tests.
+
+
The configuration that we use in
+CI
+shows what these files look like. As a proof of concept for the flexibility of
+the tests, it can be noted that we already have two different configurations.
+One for running the tests with Ironic and one for running them with BMO in
+fixture mode. The first is the “normal” mode, the latter means that BMO does not
+communicate with Ironic at all, it just pretends. While that obviously isn’t
+useful for any thorough tests, it still provides a quick and lightweight test
+suite, and ensures that we do not get too attached to one particular
+configuration.
+
+
The test suite itself is made with Ginkgo and Gomega. Instead of building a long
+chain of checks and scenarios we have attempted to do small, isolated tests.
+This makes it possible to run multiple in parallel and shorten the test suite
+duration, as well as easily identify where exactly errors occur. In order to
+accomplish this, we make heavy use of the status
+annotation so that we can skip
+inspection when possible.
+
+
Where are we today?
+
+
It has already been several months since we switched over to the BMO e2e test suite as
+the primary, and only required tests for pull requests in the BMO repository. We
+run the end to end test suite as GitHub
+workflows
+and it covers more than the metal3-dev-env and CAPM3 based tests from BMO
+perspective. That does not mean that we are done though. At the time of writing,
+there are several GitHub
+issues for improving and
+extending the tests. The progress has significantly slowed though, as can
+perhaps be expected, since the most essential parts were implemented.
+
+
The future
+
+
In the future we hope to make the BMO end to end module and tooling more useful
+for local development and testing. It should be easy to spin up a minimal
+environment and test specific scenarios, also using Tilt. Additionally, we want
+to “rebase” the CAPM3 end to end tests on this work. It should be possible to
+reuse the code and tooling for simulating a baremetal lab so that we can get rid
+of the entanglement with metal3-dev-env.
In the beginning, there was metal3-dev-env. It could set up a virtualized “baremetal” lab and test all the components together. As Metal3 matured, it grew in complexity and capabilities, with release branches, API versions, etc. Metal3-dev-env did everything from cloning the repositories and building the container images, to deploying the...
If you’ve ever tried scaling out Kubernetes clusters in a bare-metal environment, you’ll know that large-scale testing comes with serious challenges. Most of us don’t have access to enough physical servers—or even virtual machines—to simulate the kinds of large-scale environments we need for stress testing, especially when deploying hundreds or...
In part 1, we introduced the Bare Metal Operator test mode and saw how it can be used to play with BareMetalHosts without Ironic and without any actual hosts. We continued in part 2 with how to fake workload clusters enough for convincing Cluster API’s controllers that they are healthy....
The Metal3 project was present at KubeCon EU 2024 with multiple maintainers, contributors and users! For many of us, this was the first time we met in the physical world, despite working together for years already. This was very valuable and appreciated by many of us, I am sure. We...
Introduction If you’re a developer or contributor to the Metal3 project, you may need to run the Metal3 website locally to test changes and ensure everything looks as expected before deploying them. In this guide, we’ll walk you through the process of setting up and running Metal3’s website locally on...
In part 1, we introduced the Bare Metal Operator test mode and saw how it can be used to play with BareMetalHosts without Ironic and without any actual hosts. Now we will take a look at the other end of the stack and how we can fake the workload cluster...
We want to ensure that Metal3 can scale to thousands of nodes and clusters. However, running tests with thousands of real servers is expensive and we don’t have access to any such large environment in the project. So instead we have been focusing on faking the hardware while trying to...
Running on bare metal has both benefits and drawbacks. You can get the best performance possible out of the hardware, but it can also be quite expensive and maybe not necessary for all workloads. Perhaps a hybrid cluster could give you the best of both? Raw power for the workload...
Metal3 project has introduced pivoting in its CI workflow. The motivation for pivoting is to move all the objects from the ephemeral/management cluster to a target cluster. This blog post will briefly introduce the concept of pivoting and the impact it has on the overall CI workflow. For the rest...
As a part of developing the Cluster API Provider Metal3 (CAPM3) v1alpha4 release, the Metal3 crew introduced a new project: its own IP Address Manager. This blog post will go through the motivations behind such a project, the features that it brings, its use in Metal3 and future work. What...
Metal3 supports multiple types of images for deployment, the most popular being QCOW2. We have recently added support for a feature of Ironic that improves deployments on constrained environments, raw image streaming. We’ll first dive into how Ironic deploys the images on the target hosts, and how raw image streaming...
Introduction This blog post describes how to deploy a bare metal cluster, a virtual one for simplicity, using Metal³/metal3-dev-env. We will briefly discuss the steps involved in setting up the cluster as well as some of the customization available. If you want to know more about the architecture of Metal³,...
Renaming of Cluster API provider Backwards compatibility for v1alpha3 There is no backwards compatibility between v1alpha3 and v1alpha2 releases of the Cluster API provider for Metal3. For the v1alpha3 release of Cluster API, the Metal3 provider was renamed from cluster-api-provider-baremetal to cluster-api-provider-metal3. The Custom Resource Definitions were also modified. This post...
Conference talk: Metal³: Kubernetes Native Bare Metal Cluster Management - Maël Kimmerlin On the 20th of January at the Kubernetes and CNCF Finland Meetup, Maël Kimmerlin gave a brilliant presentation about the status of the Metal³ project. In this presentation, Maël starts giving a short introduction of the Cluster API...
Introduction to metal3-dev-env The metal3-dev-env is a collection of scripts in a GitHub repository inside the Metal³ project that aims to allow contributors and other interested users to run a fully functional Metal³ environment for testing and have a first contact with the project. Actually, metal3-dev-env sets up an emulated...
Conference talk: Metal³: Deploy Kubernetes on Bare Metal - Yolanda Robla, Red Hat Some of the most influential minds in the developer industry were landing in the gorgeous ancient city of Split, Croatia, to talk at the Shift Dev 2019 - Developer Conference about the most cutting-edge technologies, techniques and...
Conference talk: Introducing Metal³: Kubernetes Native Bare Metal Host Management - Russell Bryant & Doug Hellmann, Red Hat Metal³ (metal cubed/Kube) is a new open-source bare metal host provisioning tool created to enable Kubernetes-native infrastructure management. Metal³ enables the management of bare metal hosts via custom resources managed through the...
Conference talk: Extend Your Data Center to the Hybrid Edge - Red Hat Summit, May 2019, Paul Cormier, Burr Stutter and Garima Sharma A critical part of being successful in the hybrid cloud is being successful in your data centre with your own infrastructure. In this video, Paul Cormier, Burr...
Conference talk: Open Infrastructure Days UK 2019; Kubernetes-native Infrastructure: Managed Baremetal with Kubernetes Operators and OpenStack Ironic - Steve Hardy, Red Hat In this session, you can hear about a new effort to enable baremetal Kubernetes deployments using native interfaces, and in particular, the Kubernetes Operator framework, combined with OpenStack...
Conference talk: OpenStack Ironic and Bare Metal Infrastructure: All Abstractions Start Somewhere The history of cloud computing has rapidly layered abstractions on abstractions to deliver applications faster, more reliably, and easier. Serverless functions on top of containers on top of virtualization. However, at the bottom of every stack is physical...
Introduction The baremetal operator, documented at https://github.com/metal3-io/baremetal-operator/blob/master/docs/api.md, is the Operator in charge of definitions of physical hosts, containing information about how to reach the Out of Band management controller, URL with the desired image to provision, plus other properties related with hosts being used for provisioning instances. Quoting from the...
Originally posted at https://www.underkube.com/posts/2019-06-25-metal3/ In this blog post, I’m going to try to explain in my own words a high level overview of what Metal3 is, the motivation behind it and some concepts related to a ‘baremetal operator’. Let’s have some definitions! Custom Resource Definition The k8s API provides out-of-the-box...
The new stack Metal³ Uses OpenStack’s Ironic for Declarative Bare Metal Kubernetes Mike Melanson talks in this article about the Open Infrastructure Summit in Denver, Colorado. Where bare metal was one of the main leads of the event. During this event, the OpenStack Foundation unveiled a new project called Metal³...
Originally posted at https://blog.russellbryant.net/post/2019/04/2019-04-30-metal-metal-kubed-bare-metal-provisioning-for-kubernetes/ Project Introduction There are a number of great open-source tools for bare metal host provisioning, including Ironic. Metal³ aims to build on these technologies to provide a Kubernetes native API for managing bare metal hosts via a provisioning stack that is also running on Kubernetes. We...
The Register; Raise some horns: Red Hat’s Metal³ aims to make Kubernetes on bare machines simple Max Smolaks talks in this article about the OpenInfra Days in the UK, 2019: where Metal³ was revealed earlier last week by Steve Hardy, Red Hat’s senior principal software engineer. The Open Infrastructure Days...
Want to contribute to the Metal3 Project? Here's everything you need to know.
+
+
+
+
+
+
About Metal3 Community
+
+
The Metal3 community is an open-source community dedicated to the advancement of the Metal3 project, a Kubernetes-based solution for managing bare metal infrastructure as code. Leveraging Kubernetes and tools like Ironic, Metal3 allows users to treat physical machines like virtual machines, simplifying provisioning, management, and scalability of bare metal resources.
+
+
If you want to learn more about the Metal3 community or get involved in the project, you can visit their official community page at Community Resources.
The Metal³ project (pronounced “metal cubed”) exists to provide components that allow you to do bare metal host management for Kubernetes. Metal³ works as a Kubernetes application, meaning it runs on Kubernetes and is managed through Kubernetes interfaces.
+
If you are looking for documentation about how to use Metal³, please check the user-guide.
+
+
+
+
+
+
+
Metal3 Component Overview
+
+
+
It is helpful to understand the high-level architecture of the Machine API Integration. Click on each step to learn more about that particular component.
+
+
+
The first component is the Bare Metal Actuator, which is an implementation of the Machine Actuator interface defined by the cluster-api project. This actuator reacts to changes to Machine objects and acts as a client of the BareMetalHost custom resources managed by the Bare Metal Operator.
+
+
+
+
The architecture also includes a new Bare Metal Operator, which includes the following:
+
A Controller for a new Custom Resource, BareMetalHost. This custom resource represents an inventory of known (configured or automatically discovered) bare metal hosts. When a Machine is created the Bare Metal Actuator will claim one of these hosts to be provisioned as a new Kubernetes node.
+ In response to BareMetalHost updates, the controller will perform bare metal host provisioning actions as necessary to reach the desired state.
+ The creation of the BareMetalHost inventory can be done in two ways:
+
+
Manually via creating BareMetalHost objects.
+
Optionally, automatically created via a bare metal host discovery process.
+ For more information about Operators, see the operator-sdk.
+
+
+
+
+
+
The operator manages a set of tools for controlling the power on the host, monitoring the host status, and provisioning images to the host. These tools run inside the pod with the operator, and do not require any configuration by the user.
Here are some answers to common questions, discover more about Metal3
+
+
+
+
+
+ What is the baremetal operator?
+
+
+
Baremetal Operator is a Kubernetes controller providing support for
+several custom resources, most importantly - BareMetalHosts.
+
+
+
+
+ What kind of boot processes can be paired with specific BMC protocols?
+
+
+
Drivers with “virtual media” in their name can use the virtual media
+technology to boot an ISO remotely. The other drivers require network
+boot, more specifically - iPXE.
+
+
+
+
+ What is Cluster API provider Metal3 (CAPM3)?
+
+
+
CAPM3 is an
+infrastructure provider
+for the Cluster API that uses Metal3 and Ironic to provision machines
+for your cluster.
+
+
+
+
+ How does Metal3 relate to Cluster API (CAPI)?
+
+
+
The Metal3 project includes the Cluster API Provider Metal3 (CAPM3) - an
+infrastructure provider for Cluster API.
+
+
+
+
+ What CPU architectures are supported?
+
+
+
Both x86_64 (Intel) and AARCH64 (Arm) are supported. Mixed architectures
+(e.g. some hosts x86_64, some - aarch64) are not supported yet.
+
+
+
+
+ What is IPMI?
+
+
+
IPMI is the acronym for Intelligent Platform Management Interface
+which is used to monitor hardware health (fans, voltage, temperature,
+etc). The specification is available
+here
+and was created by a joint effort by several manufacturers. It allows us
+to also define the boot order and power status of the hardware.
+
+
+
+
+ What kinds of operating systems can be installed?
+
+
+
You can use any operating system that is available in a cloud format
+(e.g. qcow2). If you need first boot configuration, the image has to
+contain cloud-init or a similar first-boot tool.
+
+
+
+
+ Does Metal3 support provisioners other than Ironic?
+
+
+
While it’s technically possible to add more provisioners, only Ironic is
+supported now, and supporting other provisioners is not on the current
+roadmap.
+
+
+
+
+ How can one supply network configuration during provisioning?
+
+ Ironic is developed as part of OpenStack, does Metal3 require OpenStack?
+
+
+
Ironic can be used as a stand-alone service without any other OpenStack
+services. In fact, Baremetal Operator does not support any other
+OpenStack services.
+
+
+
+
+ Can I use my own operating system installer with Metal3?
+
+
+
You can use the live ISO workflow
+to attach a bootable ISO to the machine using virtual media. Note that
+Baremetal Operator will not track the installation process in this case
+and will consider the host active once the ISO is booted.
+
+
+
+
+ What is an out-of-band management controller?
+
+
+
Enterprise hardware usually has an integrated or optional controller
+that allows reaching the server even if it’s powered down, either via
+dedicated or shared NIC. This controller allows some checks on the
+server hardware and can also perform some settings like changing power
+status, changing Boot Order, etc. The Baremetal Operator uses it to
+power on, reboot and provision the physical servers to be used for
+running workloads on top. Commercial names include iDrac, iLO,
+iRMC, etc., and most of them should support IPMI.
+
+
+
+
+ Do I need to use the Metal3 with Cluster API or can I use Metal3 independently?
+
+
+
It is completely optional to use Cluster API. You can use only the
+Baremetal Operator and skip CAPM3 completely if all you need is
+bare-metal provisioning via Kubernetes API.
+
+
+
+
+ What is Ironic and how does Metal3 relate to it?
+
+
+
Ironic is a bare metal provisioner, it handles provisioning of physical
+machines. Metal3 exposes a part of the Ironic functionality as a
+Kubernetes native API via the Baremetal Operator. Ironic is not part of
+Metal3 but Metal3 relies on Ironic to provision the bare metal hosts.
+
+
+
+
+ What is an operator?
+
+
+
An Operator is a method of packaging, deploying and managing a
+Kubernetes application. A Kubernetes application is an application that
+is both deployed on Kubernetes and managed using the Kubernetes APIs and
+kubectl tooling. You can think of Operators as the runtime that manages
+this type of application on Kubernetes. If you want to learn more about
+Operators you can check the Operator framework website
+https://operatorframework.io/what/
+
+
+
+
+ What is cleaning? Can I disable it?
+
+
+
Cleaning removes partitioning information from the disks to avoid
+conflicts with the new operating system. See
+automated cleaning for details.
+
+
+
+
+ What is inspection? Can I disable it?
+
+
+
Inspection is used to populate hardware information in the BareMetalHost
+objects. You can disable it,
+but you may need to populate this information yourself. Do not blindly
+disable inspection if it fails - chances are high the subsequent
+operations fail the same way.
+
+
+
+
+ What is iPXE?
+
+
+
The iPXE project develops firmware for booting machines over the
+network. It’s a more feature-rich alternative to the well known PXE and
+can be used as an add-on on top of PXE.
+
+
+
+
+ What is virtual media?
+
+
+
Virtual media is a technology that allows booting an ISO on a remote
+machine without resorting to network boot (e.g. PXE).
+
+
+
+
+ Why use Ironic?
+
+
+
Ironic is an established service with a long history of production usage
+and good support for industry standards. By using it, Metal3 can
+concentrate on providing the best integration with Kubernetes.
Baremetal Operator is a Kubernetes controller providing support for
+several custom resources, most importantly - BareMetalHosts.
diff --git a/faqs/boot-processes.html b/faqs/boot-processes.html
new file mode 100644
index 000000000..0ff7bcc8a
--- /dev/null
+++ b/faqs/boot-processes.html
@@ -0,0 +1,3 @@
+
Drivers with “virtual media” in their name can use the virtual media
+technology to boot an ISO remotely. The other drivers require network
+boot, more specifically - iPXE.
diff --git a/faqs/capm3.html b/faqs/capm3.html
new file mode 100644
index 000000000..a0eb0f8a5
--- /dev/null
+++ b/faqs/capm3.html
@@ -0,0 +1,4 @@
+
CAPM3 is an
+infrastructure provider
+for the Cluster API that uses Metal3 and Ironic to provision machines
+for your cluster.
diff --git a/faqs/cluster-api.html b/faqs/cluster-api.html
new file mode 100644
index 000000000..5288d6aeb
--- /dev/null
+++ b/faqs/cluster-api.html
@@ -0,0 +1,2 @@
+
The Metal3 project includes the Cluster API Provider Metal3 (CAPM3) - an
+infrastructure provider for Cluster API.
diff --git a/faqs/cpu-architectures.html b/faqs/cpu-architectures.html
new file mode 100644
index 000000000..12de4acdd
--- /dev/null
+++ b/faqs/cpu-architectures.html
@@ -0,0 +1,2 @@
+
Both x86_64 (Intel) and AARCH64 (Arm) are supported. Mixed architectures
+(e.g. some hosts x86_64, some - aarch64) are not supported yet.
diff --git a/faqs/ipmi.html b/faqs/ipmi.html
new file mode 100644
index 000000000..2f3b6f3ff
--- /dev/null
+++ b/faqs/ipmi.html
@@ -0,0 +1,6 @@
+
IPMI is the acronym for Intelligent Platform Management Interface
+which is used to monitor hardware health (fans, voltage, temperature,
+etc). The specification is available
+here
+and was created by a joint effort by several manufacturers. It allows us
+to also define the boot order and power status of the hardware.
diff --git a/faqs/kinds-of-operating-systems.html b/faqs/kinds-of-operating-systems.html
new file mode 100644
index 000000000..b07ff9a0c
--- /dev/null
+++ b/faqs/kinds-of-operating-systems.html
@@ -0,0 +1,3 @@
+
You can use any operating system that is available in a cloud format
+(e.g. qcow2). If you need first boot configuration, the image has to
+contain cloud-init or a similar first-boot tool.
diff --git a/faqs/metal3-support-provisioners.html b/faqs/metal3-support-provisioners.html
new file mode 100644
index 000000000..574e0279f
--- /dev/null
+++ b/faqs/metal3-support-provisioners.html
@@ -0,0 +1,3 @@
+
While it’s technically possible to add more provisioners, only Ironic is
+supported now, and supporting other provisioners is not on the current
+roadmap.
diff --git a/faqs/network-configuration.html b/faqs/network-configuration.html
new file mode 100644
index 000000000..4d47dfee7
--- /dev/null
+++ b/faqs/network-configuration.html
@@ -0,0 +1,2 @@
+
diff --git a/faqs/openstack.html b/faqs/openstack.html
new file mode 100644
index 000000000..3ac1fc9af
--- /dev/null
+++ b/faqs/openstack.html
@@ -0,0 +1,3 @@
+
Ironic can be used as a stand-alone service without any other OpenStack
+services. In fact, Baremetal Operator does not support any other
+OpenStack services.
diff --git a/faqs/operating-system-installer.html b/faqs/operating-system-installer.html
new file mode 100644
index 000000000..26ebd779f
--- /dev/null
+++ b/faqs/operating-system-installer.html
@@ -0,0 +1,4 @@
+
You can use the live ISO workflow
+to attach a bootable ISO to the machine using virtual media. Note that
+Baremetal Operator will not track the installation process in this case
+and will consider the host active once the ISO is booted.
diff --git a/faqs/out-of-band.html b/faqs/out-of-band.html
new file mode 100644
index 000000000..584e1c669
--- /dev/null
+++ b/faqs/out-of-band.html
@@ -0,0 +1,8 @@
+
Enterprise hardware usually has an integrated or optional controller
+that allows reaching the server even if it’s powered down, either via
+dedicated or shared NIC. This controller allows some checks on the
+server hardware and can also perform some settings like changing power
+status, changing Boot Order, etc. The Baremetal Operator uses it to
+power on, reboot and provision the physical servers to be used for
+running workloads on top. Commercial names include iDrac, iLO,
+iRMC, etc., and most of them should support IPMI.
diff --git a/faqs/using-metal3-independently.html b/faqs/using-metal3-independently.html
new file mode 100644
index 000000000..089e3b5b4
--- /dev/null
+++ b/faqs/using-metal3-independently.html
@@ -0,0 +1,3 @@
+
It is completely optional to use Cluster API. You can use only the
+Baremetal Operator and skip CAPM3 completely if all you need is
+bare-metal provisioning via Kubernetes API.
diff --git a/faqs/what-is-Ironic.html b/faqs/what-is-Ironic.html
new file mode 100644
index 000000000..aa8c585a6
--- /dev/null
+++ b/faqs/what-is-Ironic.html
@@ -0,0 +1,4 @@
+
Ironic is a bare metal provisioner, it handles provisioning of physical
+machines. Metal3 exposes a part of the Ironic functionality as a
+Kubernetes native API via the Baremetal Operator. Ironic is not part of
+Metal3 but Metal3 relies on Ironic to provision the bare metal hosts.
diff --git a/faqs/what-is-an-operator.html b/faqs/what-is-an-operator.html
new file mode 100644
index 000000000..0b71df862
--- /dev/null
+++ b/faqs/what-is-an-operator.html
@@ -0,0 +1,7 @@
+
An Operator is a method of packaging, deploying and managing a
+Kubernetes application. A Kubernetes application is an application that
+is both deployed on Kubernetes and managed using the Kubernetes APIs and
+kubectl tooling. You can think of Operators as the runtime that manages
+this type of application on Kubernetes. If you want to learn more about
+Operators you can check the Operator framework website
+https://operatorframework.io/what/
diff --git a/faqs/what-is-cleaning.html b/faqs/what-is-cleaning.html
new file mode 100644
index 000000000..6d67716fc
--- /dev/null
+++ b/faqs/what-is-cleaning.html
@@ -0,0 +1,3 @@
+
Cleaning removes partitioning information from the disks to avoid
+conflicts with the new operating system. See
+automated cleaning for details.
diff --git a/faqs/what-is-inspection.html b/faqs/what-is-inspection.html
new file mode 100644
index 000000000..63c761ee2
--- /dev/null
+++ b/faqs/what-is-inspection.html
@@ -0,0 +1,5 @@
+
Inspection is used to populate hardware information in the BareMetalHost
+objects. You can disable it,
+but you may need to populate this information yourself. Do not blindly
+disable inspection if it fails - chances are high the subsequent
+operations fail the same way.
diff --git a/faqs/what-is-ipxe.html b/faqs/what-is-ipxe.html
new file mode 100644
index 000000000..4ae2929f8
--- /dev/null
+++ b/faqs/what-is-ipxe.html
@@ -0,0 +1,3 @@
+
The iPXE project develops firmware for booting machines over the
+network. It’s a more feature-rich alternative to the well known PXE and
+can be used as an add-on on top of PXE.
diff --git a/faqs/what-is-virtual-media.html b/faqs/what-is-virtual-media.html
new file mode 100644
index 000000000..e853eaa13
--- /dev/null
+++ b/faqs/what-is-virtual-media.html
@@ -0,0 +1,2 @@
+
Virtual media is a technology that allows booting an ISO on a remote
+machine without resorting to network boot (e.g. PXE).
diff --git a/faqs/why-use-Ironic.html b/faqs/why-use-Ironic.html
new file mode 100644
index 000000000..bcc33c0cc
--- /dev/null
+++ b/faqs/why-use-Ironic.html
@@ -0,0 +1,3 @@
+
Ironic is an established service with a long history of production usage
+and good support for industry standards. By using it, Metal3 can
+concentrate on providing the best integration with Kubernetes.
diff --git a/favicon.png b/favicon.png
new file mode 100644
index 000000000..1b97f6b8a
Binary files /dev/null and b/favicon.png differ
diff --git a/feed.xml b/feed.xml
new file mode 100644
index 000000000..fd721d53e
--- /dev/null
+++ b/feed.xml
@@ -0,0 +1,2609 @@
+Jekyll2024-12-18T18:22:49-06:00https://metal3.io/feed.xmlMetal³ - Metal KubedMetal3.io aims to build on baremetal host provisioning technologies to provide a Kubernetes native API for managing bare metal hosts via a provisioning stack that is also running on Kubernetes.Introducing Baremetal Operator end-to-end test suite2024-12-13T00:00:00-06:002024-12-13T00:00:00-06:00https://metal3.io/blog/2024/12/13/Introducing-BMO-E2EIn the beginning, there was
+metal3-dev-env. It could set up a
+virtualized “baremetal” lab and test all the components together. As Metal3
+matured, it grew in complexity and capabilities, with release branches, API
+versions, etc. Metal3-dev-env did everything from cloning the repositories and
+building the container images, to deploying the controllers and running tests,
+on top of setting up the virtual machines and the networks, of course. Needless
+to say, it became hard to understand and easy to misuse.
+
+
We tried reducing the scope a bit by introducing end to end tests directly in
+the Cluster API provider
+Metal3
+(CAPM3). However, metal3-dev-env was still very much entangled with CAPM3. It
+was at this point that I got tired of trying to gradually fix it and took the
+initiative to start from scratch with end to end tests in Baremetal Operator
+(BMO) instead.
+
+
Up until that point, we had been testing BMO through CAPM3 and the cluster API
+flow. It worked, but it was very inefficient. From the perspective of the
+Baremetal Operator, a test could look something like this:
+
+
+
Register 5 BareMetalHosts
+
Inspect the 5 BareMetalHosts
+
Provision the 5 BareMetalHosts all with the same image
+
Deprovision 1 BareMetalHost
+
Provision it again with another image
+
Deprovision another BareMetalHost
+
Provision it again with the other image
+
Continue in the same way with the rest of the BareMetalHosts…
+
Deprovision all BareMetalHosts
+
+
+
As you can see, it is very repetitive, constantly doing the same thing again and
+again. As a consequence of this and the complexity of metal3-dev-env, it was
+quite an effort to thoroughly test something related to BMO code. I was
+constantly questioning myself and the test environment. “Is it testing the code
+I wrote?” “Is it doing the relevant scenario?” “Is the configuration correct?”
+
+
Baremetal Operator end to end tests are born
+
+
Sometimes it is easier to start from scratch, so this is what we
+did. The Baremetal
+Operator end to end tests started out as a small script that only set up
+minikube, some VMs and a baseboard management controller (BMC) emulator. The
+goal was simple: do the minimum required to simulate a baremetal lab. From this,
+it was quite easy to build a test module that was responsible for deploying the
+necessary controllers and running some tests.
+
+
Notice the separation of concerns here! The test module expects a baremetal lab
+environment to be already existing and the script that sets up the environment
+is not involved in any way with the tests or deployment of the controllers. This
+design is deliberate, with a clear goal that the test module should be useful
+across multiple environments. It should be possible to run the test suite
+against real baremetal labs with multiple different configurations. I am hoping
+that we will get a chance next year to try it for real in a baremetal lab.
+
+
How does it work?
+
+
The flexibility of the end to end module is possible through a configuration
+file. It can be used to configure everything from the image URL and checksum to
+the timeout limits. Since Ironic can be deployed in many different ways, it was
+also necessary to make this flexible. The user can optionally set up Ironic
+before the test, or provide a kustomization that will be applied automatically.
+A separate configuration file declares the BMCs that should be used in the
+tests.
+
+
The configuration that we use in
+CI
+shows what these files look like. As a proof of concept for the flexibility of
+the tests, it can be noted that we already have two different configurations.
+One for running the tests with Ironic and one for running them with BMO in
+fixture mode. The first is the “normal” mode, the latter means that BMO does not
+communicate with Ironic at all, it just pretends. While that obviously isn’t
+useful for any thorough tests, it still provides a quick and lightweight test
+suite, and ensures that we do not get too attached to one particular
+configuration.
+
+
The test suite itself is made with Ginkgo and Gomega. Instead of building a long
+chain of checks and scenarios we have attempted to do small, isolated tests.
+This makes it possible to run multiple in parallel and shorten the test suite
+duration, as well as easily identify where exactly errors occur. In order to
+accomplish this, we make heavy use of the status
+annotation so that we can skip
+inspection when possible.
+
+
Where are we today?
+
+
It is already several months since we switched over to the BMO e2e test suite as
+the primary, and only required tests for pull requests in the BMO repository. We
+run the end to end test suite as GitHub
+workflows
+and it covers more than the metal3-dev-env and CAPM3 based tests from BMO
+perspective. That does not mean that we are done though. At the time of writing,
+there are several GitHub
+issues for improving and
+extending the tests. The progress has significantly slowed though, as can
+perhaps be expected, since the most essential parts were implemented.
+
+
The future
+
+
In the future we hope to make the BMO end to end module and tooling more useful
+for local development and testing. It should be easy to spin up a minimal
+environment and test specific scenarios, also using Tilt. Additionally, we want
+to “rebase” the CAPM3 end to end tests on this work. It should be possible to
+reuse the code and tooling for simulating a baremetal lab so that we can get rid
+of the entanglement with metal3-dev-env.
]]>Lennart JernScaling Kubernetes with Metal3: Simulating 1000 Clusters with Fake Ironic Agents2024-10-24T00:00:00-05:002024-10-24T00:00:00-05:00https://metal3.io/blog/2024/10/24/Scaling-Kubernetes-with-Metal3-on-Fake-NodeIf you’ve ever tried scaling out Kubernetes clusters in a bare-metal
+environment, you’ll know that large-scale testing comes with serious challenges.
+Most of us don’t have access to enough physical servers—or even virtual
+machines—to simulate the kinds of large-scale environments we need for stress
+testing, especially when deploying hundreds or thousands of clusters.
+
+
That’s where this experiment comes in.
+
+
Using Metal3, we simulated a massive environment—provisioning 1000 single-node
+Kubernetes clusters—without any actual hardware. The trick? A combination of
+Fake Ironic Python Agents (IPA) and Fake Kubernetes API servers. These tools
+allowed us to run an entirely realistic Metal3 provisioning workflow while
+simulating thousands of nodes and clusters, all without needing a single real
+machine.
+
+
The motivation behind this was simple: to create a scalable testing environment
+that lets us validate Metal3’s performance, workflow, and reliability without
+needing an expensive hardware lab or virtual machine fleet. By simulating nodes
+and clusters, we could push the limits of Metal3’s provisioning process
+cost-effectively and time-efficiently.
+
+
In this post, I’ll explain exactly how it all works, from setting up multiple
+Ironic services to faking hardware nodes and clusters and sharing the lessons
+learned. Whether you’re a Metal3 user or just curious about how to test
+large-scale Kubernetes environments, it’ll surely be a good read. Let’s get
+started!
+
+
Prerequisites & Setup
+
+
Before diving into the fun stuff, let’s ensure we’re on the same page. You don’t
+need to be a Metal3 expert to follow along, but having a bit of background will
+help!
+
+
What You’ll Need to Know
+
+
Let’s start by ensuring you’re familiar with some essential tools and concepts
+that power Metal3 workflow. If you’re confident in your Metal3 skills, please
+feel free to skip this part.
+
+
A typical Metal3 Workflow
+
+
The following diagram explains a typical Metal3 workflow. We will, then, go into
+details of every component.
+
+
+
+
Cluster API (CAPI)
+
+
CAPI is a project that simplifies the deployment and management of Kubernetes
+clusters. It provides a consistent way to create, update, and scale clusters
+through Kubernetes-native APIs. The magic of CAPI is that it abstracts away many
+of the underlying details so that you can manage clusters on different platforms
+(cloud, bare metal, etc.) in a unified way.
+
+
Cluster API Provider Metal3 (CAPM3)
+
+
CAPM3 extends CAPI to work specifically with Metal3 environments. It connects
+the dots between CAPI, BMO, and Ironic, allowing Kubernetes clusters to be
+deployed on bare-metal infrastructure. It handles tasks like provisioning new
+nodes, registering them with Kubernetes, and scaling clusters.
+
+
Bare Metal Operator (BMO)
+
+
BMO is a controller that runs inside a Kubernetes cluster and works alongside
+Ironic to manage bare-metal infrastructure. It automates the lifecycle of
+bare-metal hosts, managing things like registering new hosts, powering them on
+or off, and monitoring their status.
+
+
Bare Metal Host (BMH)
+
+
A BMH is the Kubernetes representation of a bare-metal node. It contains
+information about how to reach the node it represents, and BMO monitors its
+desired state closely. When BMO notices that a BMH object state is requested to
+change (either by a human user or CAPM3), it will decide what needs to be done
+and tell Ironic.
+
+
Ironic & Ironic Python Agent (IPA)
+
+
+
Ironic is a bare-metal provisioning tool that handles tasks like booting
+servers, deploying bootable media (e.g., operating systems) to disk, and
+configuring hardware. Think of Ironic as the piece of software that manages
+actual physical servers. In a Metal3 workflow, Ironic receives orders from BMO
+and translates them into actionable steps. Ironic has multiple ways to interact
+with the machines, and one of them is the so-called “agent-based direct deploy”
+method, which is commonly used by BMO. The agent mentioned is called Ironic
+Python Agent (IPA), which is a piece of software that runs on each bare-metal
+node and carries out Ironic’s instructions. It interacts with the hardware
+directly, like wiping disks, configuring networks, and handling boot processes.
+
+
+
In a typical Metal3 workflow, BMO reads the desired state of the node from the
+BMH object, translates the Kubernetes reconciling logic to concrete actions, and
+forwards them to Ironic, which, as part of the provisioning process, tells IPA
+the exact steps it needs to perform to get the nodes to desired states. During
+the first boot after node image installation, Kubernetes components will be
+installed on the nodes by cloud-init, and once the process succeeds, Ironic
+and IPA finish the provisioning process, and CAPI and CAPM3 will verify the
+health of the newly provisioned Kubernetes cluster(s).
+
+
The Experiment: Simulating 1000 Kubernetes Clusters
+
+
This experiment aimed to push Metal3 to simulate 1000 single-node Kubernetes
+clusters on fake hardware. Instead of provisioning real machines, we used Fake
+Ironic Python Agents (Fake IPA) and Fake Kubernetes API Servers (FKAS) to
+simulate nodes and control planes, respectively. This setup allowed us to test a
+massive environment without the need for physical infrastructure.
+
+
Since our goal is to verify the Metal3 limit, our setup will let all the Metal3
+components (except for IPA, which runs inside and will be scaled with the nodes)
+keep working as they do in a typical workflow. In fact, none of the
+components should be aware that they are running with fake hardware.
+
+
Taking the figure we had earlier as a base, here is the revised workflow with fake
+nodes.
+
+
+
+
Step 1: Setting Up the environment
+
+
As you may know, a typical Metal3 workflow requires several components:
+bootstrap Kubernetes cluster, possible external networks, bare-metal nodes, etc.
+As we are working on simulating the environment, we will start with a newly
+spawned Ubuntu VM, create a cluster with minikube, add networks with libvirt,
+and so on (If you’re familiar with Metal3’s dev-env, this step is similar to
+what script
+01,
+02
+and a part of
+03
+do). We will not discuss this part, but you can find the related setup from
+this
+script
+if interested.
+
+
Note: If you intend to follow along, note that going to 1000 nodes requires
+a large environment and will take a long time. In our setup, we had a VM with 24
+cores and 32GB of RAM, of which we assigned 14 cores and 20GB of RAM to the
+minikube VM, and the process took roughly 48 hours. If your environment is less
+powerful, consider reducing the nodes you want to provision. Something like 100
+nodes will require minimal resources and time while still being impressive.
+
+
Step 2: Install BMO and Ironic
+
+
In Metal3’s typical workflow, we usually rely on Kustomize to install Ironic and
+BMO. Kustomize helps us define configurations for Kubernetes resources, making
+it easier to customize and deploy services. However, our current Kustomize
+overlay for Metal3 configures only a single Ironic instance. This setup works
+well for smaller environments, but it becomes a bottleneck when scaling up and
+handling thousands of nodes.
+
+
That’s where Ironic’s special mode comes into play. Ironic has the ability
+to run multiple Ironic conductors while sharing the same database. The best
+part? Workload balancing between conductors happens automatically, which means
+that no matter which Ironic conductor receives a request, the load is evenly
+distributed across all conductors, ensuring efficient provisioning. Achieving
+this requires separating ironic conductor from the database, which allows us
+to scale up the conductor part. Each conductor will have its own
+PROVISIONING_IP, hence the need to have a specialized configMap.
+
+
We used Helm for this purpose. In our Helm chart, the
+Ironic conductor container and HTTP server (httpd) container are
+separated into a new pod, and the rest of the ironic package (mostly
+MariaDB-ironic database) stays in another pod. A list of PROVISIONING_IPs is
+provided by the chart’s values.yaml, and for each IP, an ironic conductor
+pod is created, along with a config map whose values are rendered with the IP’s
+value. This way, we can dynamically scale up/down ironic (or, more specifically,
+ironic conductors) by simply adding/removing ips.
+
+
Another piece of information that we need to keep in mind is the ipa-downloader
+container. In our current metal3-dev-env, the IPA-downloader container runs as
+an init Container for ironic, and its job is to download the IPA image to a
+Persistent Volume. This image contains the Ironic Python Agent, and it is
+assumed to exist by Ironic. For the multiple-conductor scenario, running the
+same init-container for all the conductors, at the same time, could be slow
+and/or fail due to network issues. To make it work, we made a small “hack” in the
+chart: the ipa image will exist in a specific location inside the minikube host,
+and all the conductor pods will mount to that same location. In production, a
+more thorough solution might be to keep the IPA-downloader as an
+init-container, but point the image to the local image server, which we set up
+in the previous step.
+
+
BMO, on the other hand, still works well with kustomize, as we do not need to
+scale it. As with typical metal3 workflow, BMO and Ironic must share some
+authentication to work with TLS.
+
+
You can check out the full Ironic helm chart
+here.
+
+
Step 3: Creating Fake Nodes with Fake Ironic Python Agents
+
+
As we mentioned at the beginning, instead of using real hardware, we will use a
+new tool called Fake Ironic Python Agent, or Fake IPA to simulate the
+nodes.
+
+
Setting up Fake IPA is relatively straightforward, as Fake IPA runs as
+containers on the host machine, but first, we need to create the list of “nodes”
+that we will use (Fake IPA requires that list to be ready when it starts). A
+“node” typically looks like this
All of the variables (uuid, node_name, macaddress) can be dynamically
+generated in any way you want (check this
+script
+out if you need an idea). Still, we must store this information to generate the
+BMH objects that match those “nodes.” The ip is, on the other hand, not
+essential. It could be anything.
+
+
We must also start up the sushy-tools container in this step. It is a tool
+that simulates the Baseboard Management
+Controller
+for non-bare-metal hardware, and we have been using it extensively inside Metal3
+dev-env and CI to control and provision VMs as if they are bare-metal nodes. In
+a bare-metal setup, Ironic will ask the BMC to install IPA on the node, and in
+our setup, sushy-tools will get the same request, but it will simply fake
+the installation and, in the end, forward Ironic traffic to the Fake IPA
+container.
+
+
Another piece of information we will need is the cert that Ironic will use
+in its communication with IPA. IPA is supposed to get it from Ironic, but as
+Fake IPA cannot do that (at least not yet), we must get the cert and provide
+it in Fake IPA config.
Also note that one set of sushy-tools and Fake IPA containers won’t be
+enough to provision 1000 nodes. Just like Ironic, they need to be scaled up
+extensively (about 20-30 pairs will be sufficient for 1000 nodes), but
+fortunately, the scaling is straightforward: We just need to give them different
+ports. Both of these components also require a Python-based config file. For
+convenience, in this setup, we create a big file and provide it to both of them,
+using the following shell script:
+
+
for i in$(seq 1 "$N_SUSHY");do
+ container_conf_dir="$SUSHY_CONF_DIR/sushy-$i"
+
+ # Use round-robin to choose fake-ipa and sushy-tools containers for the node
+ fake_ipa_port=$((9901+(($i%${N_FAKE_IPA:-1}))))
+ sushy_tools_port=$((8000+ i))
+ ports+=(${sushy_tools_port})
+
+ # This is only so that we have the list of the needed ports for other
+ # purposes, like configuring the firewalls.
+ ports+=(${fake_ipa_port})
+
+ mkdir-p"${container_conf_dir}"
+
+ # Generate the htpasswd file, which is required by sushy-tools
+ cat<<'EOF' >"${container_conf_dir}"/htpasswd
+admin:$2b$12$/dVOBNatORwKpF.ss99KB.vESjfyONOxyH.UgRwNyZi1Xs/W2pGVS
+EOF
+
+# Set configuration options
+ cat<<EOF >"${container_conf_dir}"/conf.py
+import collections
+
+SUSHY_EMULATOR_LIBVIRT_URI = "${LIBVIRT_URI}"
+SUSHY_EMULATOR_IGNORE_BOOT_DEVICE = False
+SUSHY_EMULATOR_VMEDIA_VERIFY_SSL = False
+SUSHY_EMULATOR_AUTH_FILE = "/root/sushy/htpasswd"
+SUSHY_EMULATOR_FAKE_DRIVER = True
+SUSHY_EMULATOR_LISTEN_PORT = "${sushy_tools_port}"
+EXTERNAL_NOTIFICATION_URL = "http://${ADVERTISE_HOST}:${fake_ipa_port}"
+FAKE_IPA_API_URL = "${API_URL}"
+FAKE_IPA_URL = "http://${ADVERTISE_HOST}:${fake_ipa_port}"
+FAKE_IPA_INSPECTION_CALLBACK_URL = "${CALLBACK_URL}"
+FAKE_IPA_ADVERTISE_ADDRESS_IP = "${ADVERTISE_HOST}"
+FAKE_IPA_ADVERTISE_ADDRESS_PORT = "${fake_ipa_port}"
+FAKE_IPA_CAFILE = "/root/cert/ironic-ca.crt"
+SUSHY_FAKE_IPA_LISTEN_IP = "${ADVERTISE_HOST}"
+SUSHY_FAKE_IPA_LISTEN_PORT = "${fake_ipa_port}"
+SUSHY_EMULATOR_FAKE_IPA = True
+SUSHY_EMULATOR_FAKE_SYSTEMS = $(cat nodes.json)
+EOF
+
+# Start sushy-tools
+ docker run -d--net host --name"sushy-tools-${i}"\
+ -v"${container_conf_dir}":/root/sushy \
+ "${SUSHY_TOOLS_IMAGE}"
+
+ # Start fake-ipa
+ docker run \
+ -d--net host --name fake-ipa-${i}\
+ -v"${container_conf_dir}":/app \
+ -v"$(realpath cert)":/root/cert \
+ "${FAKEIPA_IMAGE}"
+done
+
+
+
In this setup, we made it so that all the sushy-tools containers will
+listen on the port range running from 8001, 8002,…, while the Fake IPA
+containers have ports 9901, 9902,…
+
+
Step 4: Add the BMH objects
+
+
Now that we have sushy-tools and Fake IPA containers running, we can
+already generate the manifest for BMH objects, and apply them to the cluster. A
+BMH object will look like this
name is the node name we generated in the previous step.
+
uuid is the random uuid we generated for the same node.
+
random_mac is a random mac address for the boot. It’s NOT the same as the
+NIC mac address we generated for the node.
+
port is the listening port on one of the sushy-tools containers we
+created in the previous step. Since every sushy-tools and Fake IPA
+container has information about ALL the nodes, we can decide what container to
+locate the “node”. In general, it’s a good idea to spread them out, so all
+containers are loaded equally.
+
+
+
We can now run kubectl apply -f on one (or all of) the BMH manifests. What you
+expect to see is that a BMH object is created, and its state will change from
+registering to available after a while. It means ironic acknowledged
+that the node is valid, in good state and ready to be provisioned.
+
+
Step 5: Deploy the fake nodes to kubernetes clusters
+
+
Before provisioning our clusters, let’s init the process, so that we have CAPI
+and CAPM3 installed
+
+
clusterctl init --infrastructure=metal3
+
+
+
After a while, we should see that CAPI, CAPM3, and IPAM pods become available.
+
+
In a standard Metal3 workflow, after having the BMH objects in an available
+state, we can provision new Kubernetes clusters with clusterctl. However, with
+fake nodes, things get a tiny bit more complex. At the end of the provisioning
+process, Cluster API expects that there is a new kubernetes API server
+created for the new cluster, from which it will check if all nodes are up, all
+the control planes have apiserver, etcd, etc. pods up and running, and so
+on. It is where the Fake Kubernetes API Server
+(FKAS)
+comes in.
+
+
As the FKAS README linked above already described how it works, we won’t go
+into details. We simply need to send FKAS a register POST request (with
+the new cluster’s namespace and cluster name), and it will give us an IP and a
+port, which we can plug into our cluster template and then run clusterctl
+generate cluster.
+
+
Under the hood, FKAS generates unique API servers for different clusters.
+Each of the fake API servers does the following jobs:
+
+
+
Mimicking API Calls: The Fake Kubernetes API server was set up to respond to
+the essential Kubernetes API calls made during provisioning.
+
Node Registration: When CAPM3 registered nodes, the Fake API server returned
+success responses, making Metal3 believe the nodes had joined a real Kubernetes
+cluster.
+
Cluster Health and Status: The Fake API responded with “healthy” statuses,
+allowing CAPI/CAPM3 to continue its workflow without interruption.
+
Node Creation and Deletion: When CAPI queried for node status or attempted to
+add/remove nodes, the Fake API server responded realistically, ensuring the
+provisioning process continued smoothly.
+
Pretending to Host Kubelet: The Fake API server also simulated kubelet
+responses, which allowed CAPI/CAPM3 to interact with the fake clusters as though
+they were managing actual nodes.
+
+
+
Note that in this experiment, we provisioned every one of the 1000 fake nodes to
+a single-node cluster, but it’s possible to increase the number of control
+planes and worker nodes by changing the --control-plane-machine-count and
+--worker-machine-count parameters in the clusterctl generate cluster command.
+However, you will need to ensure that all clusters’ total nodes do not exceed
+the number of BMHs.
+
+
At a glance, the whole simulation looks like this:
+
+
+
+
It will likely take some time, but once the BMHs are all provisioned, we should
+be able to verify that all, or at least, most of the clusters are in good shape:
+
+
# This will list the clusters.
+kubectl get clusters -A
+
+# This will determine the clusters' readiness.
+kubectl get kcp -A
+
+
+
+
For each cluster, it’s also a good idea to perform a clusterctl
+check.
+
+
+
Accessing the fake cluster
+
+
A rather interesting (but not essential for our goal) check we can perform on
+the fake clusters is to try accessing them. Let’s start with fetching a
+cluster’s kubeconfig:
+
+
clusterctl -n <cluster-namespace> get kubeconfig <cluster-name> > kubeconfig-<cluster-name>.yaml
+
+
+
As usual, clusterctl will generate a kubeconfig file, but we cannot use it
+just yet. Recall that we generated the API endpoint using FKAS; the address we
+have now will be a combination of a port with FKAS’s IP address, which isn’t
+accessible from outside the cluster. What we should do now is:
+
+
+
Edit the kubeconfig-<cluster-name>.yaml so that the endpoint is in the form
+localhost:<port>.
+
Port-forward the FKAS Pod to the same port the kubeconfig has shown.
+
+
+
And voila, now we can access the fake cluster with kubectl --kubeconfig
+kubeconfig-<cluster-name>.yaml. You can inspect its state and check the
+resources (nodes, pods, etc.), but we won’t be able to run any workload on it as
+it’s fake.
+
+
Results
+
+
In this post, we have demonstrated how it is possible to “generate”
+bare-metal-based Kubernetes clusters from thin air (or rather, a bunch of nodes
+that do not exist). Of course, these “clusters” are not very useful. Still,
+successfully provisioning them without letting any of our main components
+(CAPI, CAPM3, BMO, and Ironic) know they are working with fake
+hardware proves that Metal3 is capable of handling a heavy workload and
+provisioning multiple nodes/clusters.
+
+
If interested, you could also check (and try out) the experiment by yourself
+here.
]]>Huy MaiScaling to 1000 clusters - Part 32024-05-30T00:00:00-05:002024-05-30T00:00:00-05:00https://metal3.io/blog/2024/05/30/Scaling_part_3
+
In part 1, we introduced the
+Bare Metal Operator test mode and saw how it can be used to play with
+BareMetalHosts without Ironic and without any actual hosts. We continued in
+part 2 with how to fake
+workload clusters enough for convincing Cluster API’s controllers that they are
+healthy. These two pieces together allowed us to run scaling tests and reach our
+target of 1000 single node clusters. In this final part of the blog post series,
+we will take a look at the results, the issues that we encountered and the
+improvements that have been made.
+
+
+
Issues encountered and lessons learned
+
+
As part of this work we have learned a lot. We found genuine bugs and
+performance issues, but we also learned about relevant configuration options for
+Cluster API and controllers in general.
+
+
One of the first things we hit was this bug in Bare Metal
+Operator that
+caused endless requeues for some deleted objects. It was not a big deal, barely
+noticeable, at small scale. However, at larger scales things like this become a
+problem. The logs become unreadable as they are filled with “spam” from
+requeuing deleted objects and the controller is wasting resources trying to
+reconcile them.
+
+
As mentioned, we also learned a lot from this experiment. For example, that all
+the controllers have flags for setting their concurrency, i.e. how many objects
+they reconcile in parallel. The default is 10, which works well in most cases,
+but for larger scales it may be necessary to tune this in order to speed up the
+reconciliation process.
+
+
The next thing we hit was rate limits! Both
+client-go
+and
+controller-runtime
+have default rate limits of 10 and 20 QPS (Queries Per Second) respectively that
+the controllers inherit unless overridden. In general, this is a good thing, as
+it prevents controllers from overloading the API server. They obviously become
+an issue once you scale far enough though. For us that happened when we got to
+600 clusters.
+
+
Why 600? The number was actually a good clue, and the reason we managed to figure
+out what was wrong! Let’s break it down. By default, the Cluster API controller
+will reconcile objects every 10 minutes (=600 seconds) in addition to reacting
+to events. Each reconciliation will normally involve one or more API calls, so
+at 600 clusters, we would have at least one API call per second just from the
+periodic sync. In other words, the controllers would at this point use up a
+large part of their budget on periodic reconciliation and quickly reach their
+limit when adding reactions to events, such as the creation of a new cluster.
+
+
At the time, these rate limits were not configurable in the Cluster API
+controllers, so we had to patch the controllers to increase the limits. We have
+since then added flags to the controllers to make this configurable. If you
+found this interesting, you can read more about it in this
+issue.
+
+
With concurrency and rate limits taken care of, we managed to reach our target
+of 1000 clusters in reasonable time. However, there was still a problem with
+resource usage. The Kubeadm control plane controller was unreasonably CPU
+hungry!
+
+
Luckily, Cluster API has excellent debugging and monitoring tools
+available so it was easy to
+collect data and profile the controllers. A quick look at the dashboard
+confirmed that the Kubeadm control plane controller was indeed the culprit, with
+a CPU usage far higher than the other controllers.
+
+
+
+
We then collected some profiling data and found the cause of the CPU usage. It
+was generating new private keys for accessing the workload cluster API server
+every time it needed to access it. This is a CPU intensive operation, and it
+happened four times per reconciliation! The flame graph seen below clearly shows
+the four key generation operations, and makes it obvious that this is what takes
+up most of the time spent on the CPU for the controller.
+
+
+
+
Improvements
+
+
All issues mentioned in the previous section have been addressed. The Bare Metal
+Operator is no longer re-queuing deleted objects. All controllers have flags for
+setting their concurrency and rate limits, and the Kubeadm control plane
+controller is now caching and reusing the private keys instead of generating new
+ones every time.
+
+
The impact of all of this is that
+
+
+
the Bare Metal Operator has more readable logs and lower CPU usage,
+
users can configure rate limits for all Cluster API and Metal3 controllers if
+necessary, and
+
the Kubeadm control plane controller has a much lower CPU usage and faster
+reconciliation times.
+
+
+
Results
+
+
When we set out, it was simply not possible to reach a scale of 1000 clusters in
+a reasonable time. With the collaboration and help from maintainers and other
+community members, we managed to reach our target. It is now possible to manage
+thousands of workload clusters through a single Cluster API management cluster.
Cluster API itself now also has an in-memory
+provider
+which makes it almost trivial to test large scale scenarios. However, it must be
+noted that it can only be used to test the core, bootstrap and control plane
+providers. If you want to try it out, you can use the following script. Please
+note that this will still be CPU intensive, despite the improvements mentioned
+above. Creating 1000 clusters is no small task!
This should result in 1000 ready in-memory clusters (and a pretty hot laptop if
+you run it locally). On a laptop with an i9-12900H CPU, it took about 15 minutes
+until all clusters were ready.
+
+
Conclusion and next steps
+
+
We are very happy with the results we achieved. The community has been very
+helpful and responsive, and we are very grateful for all the help we received.
+Going forward, we will hopefully be able to run scale tests periodically to
+ensure that we are not regressing. Even small scale tests can be enough to
+detect performance regressions as long as we keep track of the performance
+metrics. This is something we hope to incorporate into the CI system in the
+future.
]]>Lennart JernMetal3 at KubeCon EU 20242024-04-10T00:00:00-05:002024-04-10T00:00:00-05:00https://metal3.io/blog/2024/04/10/Metal3_at_KubeCon_EU_2024The Metal3 project was present at KubeCon EU 2024 with multiple maintainers,
+contributors and users! For many of us, this was the first time we met in the
+physical world, despite working together for years already. This was very
+valuable and appreciated by many of us, I am sure. We had time to casually
+discuss ideas and proposals, hack together on the
+ironic-standalone-operator
+and simply get to know each other.
+
+
+
+
Photo by Michael Captain.
+
+
As a project, we had the opportunity to give an update through a lightning
+talk on Tuesday!
+
+
+
+
+
+
On Wednesday we continued with a contribfest session
+where we gave an introduction to the project for potential new contributors. We
+had prepared a number of good-first-issue’s that people could choose from if
+they wanted. Perhaps more important though, was that we had time to answer
+questions, discuss use-cases, issues and features with the attendees. The new
+quick-start page was also launched just in
+time for the contribfest. It should hopefully make it easier to get started with
+the project and we encourage everyone to run through it and report or fix any
+issues found.
+
+
+
+
Photo from the official CNCF Flickr. More photos
+here.
+
+
Finally, just like in previous years, we had a table in the Project Pavilion. There was a
+lot of interest in Metal3, more than last year I would say. Even with five
+maintainers working in parallel, we still had a hard time keeping up with the
+number of people stopping by to ask questions! My takeaway from this event is
+that we still have work to do on explaining what Metal3 is and how it works. It
+is quite uncommon that people know about baseboard management controllers (BMCs)
+and this of course makes it harder to grasp what Metal3 is all about. However,
+the interest is there, so we just need to get the information out there so that
+people can learn! Another takeaway is that Cluster API in general seems to
+really take off. Many people that came by our kiosk knew about Cluster API and
+were interested in Metal3 because of the integration we have with it.
+
+
For those of you who couldn’t attend, I hope this post gives an idea about what
+happened at KubeCon related to Metal3. Did you miss the contribfest? Maybe you
+would like to contribute but don’t know where to start? Check out the
+good-first-issue’s!
+There are still plenty to choose from, and we will keep adding more.
]]>Lennart JernHow to run Metal3 website locally with Jekyll2024-01-18T00:00:00-06:002024-01-18T00:00:00-06:00https://metal3.io/blog/2024/01/18/How_to_run_Metal3_website_locally_with_JekyllIntroduction
+
+
If you’re a developer or contributor to the Metal3 project, you may need
+to run the Metal3 website locally to test changes and ensure everything
+looks as expected before deploying them. In this guide, we’ll walk you
+through the process of setting up and running Metal3’s website locally
+on your machine using Jekyll.
+
+
Prerequisites
+
+
Before we begin, make sure you have the following prerequisites
+installed on your system:
+
+
+
+
Ruby: Jekyll, the static site generator used by Metal3, is built with
+Ruby. Install Ruby and its development tools by running the following
+command in your terminal:
+
+
sudo apt install ruby-full
+
+
+
+
+
Setting up Metal3’s Website
+
+
Once Ruby is installed, we can proceed to set up Metal3’s website and
+its dependencies. Follow these steps:
+
+
+
+
Clone the Metal3 website repository from GitHub. Open your terminal
+and navigate to the directory where you want to clone the repository,
+then run the following command:
Install the required gems and dependencies using Bundler. Run the
+following command:
+
+
bundle install
+
+
+
+
+
Running the Metal3 Website Locally
+
+
With Metal3’s website and its dependencies installed, you can now start the local
+development server to view and test the website. In the terminal, navigate to the
+project’s root directory (metal3-io.github.io) and run the following command:
+
+
bundle exec jekyll serve
+
+
+
This command tells Jekyll to build the website and start a local server.
+Once the server is running, you’ll see output indicating the local
+address where the Metal3 website is being served, typically
+http://localhost:4000.
+
+
Open your web browser and enter the provided address. Congratulations!
+You should now see the Metal3 website running locally, allowing you to
+preview your changes and ensure everything is working as expected.
+
+
Conclusion
+
+
Running Metal3’s website locally using Jekyll is a great way to test
+changes and ensure the site functions properly before deploying them. By
+following the steps outlined in this guide, you’ve successfully set up
+and run Metal3’s website locally. Feel free to explore the Metal3
+documentation and contribute to the project further.
]]>Salima RabiuScaling to 1000 clusters - Part 22023-05-17T00:00:00-05:002023-05-17T00:00:00-05:00https://metal3.io/blog/2023/05/17/Scaling_part_2In part 1, we introduced the Bare Metal Operator test mode and saw how it can be used to play with BareMetalHosts without Ironic and without any actual hosts.
+Now we will take a look at the other end of the stack and how we can fake the workload cluster APIs.
+
+
Test setup
+
+
The end goal is to have one management cluster where the Cluster API and Metal3 controllers run.
+In this cluster we would generate BareMetalHosts and create Clusters, Metal3Clusters, etc to benchmark the controllers.
+To give them a realistic test, we also need to fake the workload cluster APIs.
+These will run separately in “backing” clusters to avoid interfering with the test (e.g. by using up all the resources in the management cluster).
+Here is a diagram that describes the setup:
+
+
+
+
How are we going to fake the workload cluster APIs then?
+The most obvious solution is to just run the real deal, i.e. the kube-apiserver.
+This is what would be run in a real workload cluster, together with the other components that make up the Kubernetes control plane.
+
+
If you want to follow along and try to set this up yourself, you will need at least the following tools installed:
This has been tested with Kubernetes v1.25, kind v0.19 and clusterctl v1.4.2.
+All script snippets are assumed to be for the bash shell.
+
+
Running the Kubernetes API server
+
+
There are many misconceptions, maybe even superstitions, about the Kubernetes control plane.
+The fact is that it is in no way special.
+It consists of a few programs that can be run in any way you want: in a container, as a systemd unit or directly executed at the command line.
+They can run on a Node or outside of the cluster.
+You can even run multiple instances on the same host as long as you avoid port collisions.
+
+
For our purposes we basically want to run as little as possible of the control plane components.
+We just need the API to be available and possible for us to populate with data that the controllers expect to be there.
+In other words, we need the API server and etcd.
+The scheduler is not necessary since we won’t run any actual workload (we are just pretending the Nodes are there anyway) and the controller manager would just get in the way when we want to fake resources.
+It would, for example, try to update the status of the (fake) Nodes that we want to create.
+
+
The API server will need an etcd instance to connect to.
+It will also need some TLS configuration, both for connecting to etcd and for handling service accounts.
+One simple way to generate the needed certificates is to use kubeadm.
+But before we get there we need to think about what the configuration should look like.
+
+
For simplicity, we will simply run the API server and etcd in a kind cluster for now.
+It would then be easy to run them in some other Kubernetes cluster later if needed.
+Let’s create it right away:
+
+
kind create cluster
+# Note: This has been tested with node image
+# kindest/node:v1.26.3@sha256:61b92f38dff6ccc29969e7aa154d34e38b89443af1a2c14e6cfbd2df6419c66f
+
+
+
To try to cut down on the resources required, we will also use a single multi-tenant etcd instance instead of one per API server.
+We can rely on the internal service discovery so the API server can find etcd via an address like etcd-server.etcd-system.svc.cluster.local, instead of using IP addresses.
+Finally, we will need an endpoint where the API is exposed to the cluster where the controllers are running, but for now we can focus on just getting it up and running with 127.0.0.1:6443 as the endpoint.
+
+
Based on the above, we can create a kubeadm-config.yaml file like this:
As mentioned before, we want to create a multi-tenant etcd that many API servers can share.
+For this reason, we will need to create a root user and enable authentication for etcd:
At this point we have a working etcd instance with authentication and TLS enabled.
+Each client will need to have an etcd user to interact with this instance so we need to create an etcd user for the API server.
+We already created a root user before so this should look familiar.
+
+
## Create etcd tenant
+# Create user
+kubectl -n etcd-system exec etcd-0 -- etcdctl --user root:rootpw \
+ --key=/etc/kubernetes/pki/etcd/tls.key --cert=/etc/kubernetes/pki/etcd/tls.crt --cacert /etc/kubernetes/pki/ca/tls.crt \
+ user add test --new-user-password=test
+# Create role
+kubectl -n etcd-system exec etcd-0 -- etcdctl --user root:rootpw \
+ --key=/etc/kubernetes/pki/etcd/tls.key --cert=/etc/kubernetes/pki/etcd/tls.crt --cacert /etc/kubernetes/pki/ca/tls.crt \
+ role add test
+# Add read/write permissions for prefix to the role
+kubectl -n etcd-system exec etcd-0 -- etcdctl --user root:rootpw \
+ --key=/etc/kubernetes/pki/etcd/tls.key --cert=/etc/kubernetes/pki/etcd/tls.crt --cacert /etc/kubernetes/pki/ca/tls.crt \
+ role grant-permission test --prefix=true readwrite "/test/"
+# Give the user permissions from the role
+kubectl -n etcd-system exec etcd-0 -- etcdctl --user root:rootpw \
+ --key=/etc/kubernetes/pki/etcd/tls.key --cert=/etc/kubernetes/pki/etcd/tls.crt --cacert /etc/kubernetes/pki/ca/tls.crt \
+ user grant-role test test
+
+
+
From etcd’s point of view, everything is now ready.
+The API server could theoretically use etcdctl and authenticate with the username and password that we created for it.
+However, that is not how the API server works.
+It expects to be able to authenticate using client certificates.
+Luckily, etcd supports this so we just have to generate the certificates and sign them so that etcd trusts them.
+The key thing is to set the common name in the certificate to the name of the user we want to authenticate as.
+
+
Since kubeadm always sets the same common name, we will here use openssl to generate the client certificates so that we get control over it.
In order to deploy the API server, we will first need to generate some more certificates.
+The client certificates for connecting to etcd are already ready, but it also needs certificates to secure the exposed API itself, and a few other things.
+Then we will also need to create secrets from all of these certificates:
Time to check if it worked!
+We can use port-forwarding to access the API, but of course we will need some authentication method for it to be useful.
+With kubeadm we can generate a kubeconfig based on the certificates we already have.
Note that it won’t have any Nodes or Pods running.
+It is completely empty since it is running on its own.
+There is no kubelet that registered as a Node or applied static manifests, there is no scheduler or controller manager.
+Exactly like we want it.
+
+
Faking Nodes and other resources
+
+
Let’s take a step back and think about what we have done so far.
+We have deployed a Kubernetes API server and a multi-tenant etcd instance.
+More API servers can be added in the same way, so it is straightforward to scale.
+All of it runs in a kind cluster, which means that it is easy to set up and we can switch to any other Kubernetes cluster if needed later.
+Through Kubernetes we also get an easy way to access the API servers by using port-forwarding, without exposing all of them separately.
+
+
The time has now come to think about what we need to put in the workload cluster API to convince the Cluster API and Metal3 controllers that it is healthy.
+First of all they will expect to see Nodes that match the Machines and that they have a provider ID set.
+Secondly, they will expect to see healthy control plane Pods.
+Finally, they will try to check on the etcd cluster.
+
+
The final point is a problem, but we can work around it for now by configuring external etcd.
+It will lead to a different code path for the bootstrap and control plane controllers, but until we have something better it will be a good enough test.
+
+
Creating the Nodes and control plane Pods is really easy though.
+We are just adding resources and there are no controllers or validating webhooks that can interfere.
+Try it out!
+
+
# Create a Node
+kubectl --kubeconfig=kubeconfig.yaml create -f https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/fake-node.yaml
+# Check that it worked
+kubectl --kubeconfig=kubeconfig.yaml get nodes
+# Maybe label it as part of the control plane?
+kubectl --kubeconfig=kubeconfig.yaml label node fake-node node-role.kubernetes.io/control-plane=""
+
+
+
Now add a Pod:
+
+
kubectl --kubeconfig=kubeconfig.yaml create -f https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/kube-apiserver-pod.yaml
+# Set status on the pods (it is not added when using create/apply).
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/kube-apiserver-pod-status.yaml |
+ kubectl --kubeconfig=kubeconfig.yaml -n kube-system patch pod kube-apiserver-node-name \
+ --subresource=status --patch-file=/dev/stdin
+
+
+
You should be able to see something like this:
+
+
$ kubectl --kubeconfig kubeconfig.yaml get pods -A
+NAMESPACE NAME READY STATUS RESTARTS AGE
+kube-system kube-apiserver-node-name 1/1 Running 0 16h
+$ kubectl --kubeconfig kubeconfig.yaml get nodes
+NAME STATUS ROLES AGE VERSION
+fake-node Ready <none> 16h v1.25.3
+
+
+
Now all we have to do is to ensure that the API returns information that the controllers expect.
+
+
Hooking up the API server to a Cluster API cluster
+
+
We will now set up a fresh cluster where we can run the Cluster API and Metal3 controllers.
+
+
# Delete the previous cluster
+kind delete cluster
+# Create a fresh new cluster
+kind create cluster
+# Initialize Cluster API with Metal3
+clusterctl init --infrastructure metal3
+## Deploy the Bare Metal Operator
+# Create the namespace where it will run
+kubectl create ns baremetal-operator-system
+# Deploy it in normal mode
+kubectl apply -k https://github.com/metal3-io/baremetal-operator/config/default
+# Patch it to run in test mode
+kubectl patch -n baremetal-operator-system deploy baremetal-operator-controller-manager --type=json \
+ -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--test-mode"}]'
+
+
+
You should now have a cluster with the Cluster API, Metal3 provider and Bare Metal Operator running.
+Next, we will prepare some files that will come in handy later, namely a cluster template, BareMetalHost manifest and Kubeadm configuration file.
With this we have enough to start creating the workload cluster.
+First, we need to set up some certificates.
+This should look very familiar from earlier when we created certificates for the Kubernetes API server and etcd.
We are now ready to create the cluster!
+We just need a few variables for the template.
+The important part here is the CLUSTER_APIENDPOINT_HOST and CLUSTER_APIENDPOINT_PORT, since this will be used by the controllers to connect to the workload cluster API.
+You should set the IP to the private IP of the test machine or similar.
+This way we can use port-forwarding to expose the API on this IP, which the controllers can then reach.
+The port just has to be one not in use, and preferably something that is easy to remember and associate with the correct cluster.
+For example, cluster 1 gets port 10001, cluster 2 gets 10002, etc.
This will give you a cluster and all the templates and other resources that are needed.
+However, we will need to fill in for the non-existent hardware and create the workload cluster API server, like we practiced before.
+This time it is slightly different, because some of the steps are handled by the Cluster API.
+We just need to take care of what would happen on the node, plus the etcd part since we are using external etcd configuration.
+
+
mkdir -p "/tmp/${CLUSTER}/pki/etcd"
+
+# Generate etcd client certificate
+openssl req -newkey rsa:2048 -nodes -subj "/CN=${CLUSTER}" \
+ -keyout "/tmp/${CLUSTER}/pki/apiserver-etcd-client.key" -out "/tmp/${CLUSTER}/pki/apiserver-etcd-client.csr"
+openssl x509 -req -in "/tmp/${CLUSTER}/pki/apiserver-etcd-client.csr" \
+ -CA /tmp/pki/etcd/ca.crt -CAkey /tmp/pki/etcd/ca.key -CAcreateserial \
+ -out "/tmp/${CLUSTER}/pki/apiserver-etcd-client.crt" -days 365
+
+# Get the k8s ca certificate and key.
+# This is used by kubeadm to generate the api server certificates
+kubectl -n "${NAMESPACE}" get secrets "${CLUSTER}-ca" -ojsonpath="{.data.tls\.crt}" | base64 -d > "/tmp/${CLUSTER}/pki/ca.crt"
+kubectl -n "${NAMESPACE}" get secrets "${CLUSTER}-ca" -ojsonpath="{.data.tls\.key}" | base64 -d > "/tmp/${CLUSTER}/pki/ca.key"
+
+# Generate certificates
+sed -e "s/NAMESPACE/${NAMESPACE}/g" -e "s/CLUSTER/${CLUSTER}/g" -e "s/HOST/${CLUSTER_APIENDPOINT_HOST}/g" \
+ /tmp/kubeadm-config-template.yaml > "/tmp/kubeadm-config-${CLUSTER}.yaml"
+kubeadm init phase certs apiserver --config "/tmp/kubeadm-config-${CLUSTER}.yaml"
+
+# Create secrets
+kubectl -n "${NAMESPACE}" create secret tls "${CLUSTER}-apiserver-etcd-client" --cert "/tmp/${CLUSTER}/pki/apiserver-etcd-client.crt" --key "/tmp/${CLUSTER}/pki/apiserver-etcd-client.key"
+kubectl -n "${NAMESPACE}" create secret tls apiserver --cert "/tmp/${CLUSTER}/pki/apiserver.crt" --key "/tmp/${CLUSTER}/pki/apiserver.key"
+
+
+
Now we will need to set up the fake cluster resources.
+For this we will create a second kind cluster and set up etcd, just like we did before.
Switch the context back to the first cluster with kubectl config use-context kind-kind so we don’t get confused about which is the main cluster.
+We will now need to put all the expected certificates for the fake cluster in the kind-backing-cluster-1 so that they can be used by the API server that we will deploy there.
+
+
CLUSTER=test-1
+NAMESPACE=test-1
+# Setup fake resources for cluster test-1
+kubectl --context=kind-backing-cluster-1 create namespace "${NAMESPACE}"
+kubectl --context=kind-backing-cluster-1 -n "${NAMESPACE}" create secret tls "${CLUSTER}-etcd" --cert /tmp/pki/etcd/ca.crt --key /tmp/pki/etcd/ca.key
+kubectl --context=kind-backing-cluster-1 -n "${NAMESPACE}" create secret tls "${CLUSTER}-ca" --cert /tmp/pki/ca.crt --key /tmp/pki/ca.key
+kubectl --context=kind-backing-cluster-1 -n "${NAMESPACE}" create secret tls "${CLUSTER}-apiserver-etcd-client" --cert "/tmp/${CLUSTER}/pki/apiserver-etcd-client.crt" --key "/tmp/${CLUSTER}/pki/apiserver-etcd-client.key"
+kubectl --context=kind-backing-cluster-1 -n "${NAMESPACE}" create secret tls apiserver --cert "/tmp/${CLUSTER}/pki/apiserver.crt" --key "/tmp/${CLUSTER}/pki/apiserver.key"
+
+kubectl -n "${NAMESPACE}" get secrets "${CLUSTER}-sa" -o yaml | kubectl --context=kind-backing-cluster-1 create -f -
+
+## Create etcd tenant
+# Create user
+kubectl --context=kind-backing-cluster-1 -n etcd-system exec etcd-0 -- etcdctl --user root:rootpw \
+ --key=/etc/kubernetes/pki/etcd/tls.key --cert=/etc/kubernetes/pki/etcd/tls.crt --cacert /etc/kubernetes/pki/ca/tls.crt \
+ user add "${CLUSTER}" --new-user-password="${CLUSTER}"
+# Create role
+kubectl --context=kind-backing-cluster-1 -n etcd-system exec etcd-0 -- etcdctl --user root:rootpw \
+ --key=/etc/kubernetes/pki/etcd/tls.key --cert=/etc/kubernetes/pki/etcd/tls.crt --cacert /etc/kubernetes/pki/ca/tls.crt \
+ role add "${CLUSTER}"
+# Add read/write permissions for prefix to the role
+kubectl --context=kind-backing-cluster-1 -n etcd-system exec etcd-0 -- etcdctl --user root:rootpw \
+ --key=/etc/kubernetes/pki/etcd/tls.key --cert=/etc/kubernetes/pki/etcd/tls.crt --cacert /etc/kubernetes/pki/ca/tls.crt \
+ role grant-permission "${CLUSTER}" --prefix=true readwrite "/${CLUSTER}/"
+# Give the user permissions from the role
+kubectl --context=kind-backing-cluster-1 -n etcd-system exec etcd-0 -- etcdctl --user root:rootpw \
+ --key=/etc/kubernetes/pki/etcd/tls.key --cert=/etc/kubernetes/pki/etcd/tls.crt --cacert /etc/kubernetes/pki/ca/tls.crt \
+ user grant-role "${CLUSTER}" "${CLUSTER}"
+
+
+
Check that the Metal3Machine is associated with a BareMetalHost.
+Deploy the API server.
+
+
# Deploy API server
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/manifests/v2/kube-apiserver-deployment.yaml |
+ sed -e "s/CLUSTER/${CLUSTER}/g" | kubectl --context=kind-backing-cluster-1 -n "${NAMESPACE}" apply -f -
+kubectl --context=kind-backing-cluster-1 -n "${NAMESPACE}" wait --for=condition=Available deploy/test-kube-apiserver
+
+# Get kubeconfig
+clusterctl -n "${NAMESPACE}" get kubeconfig "${CLUSTER}" > "/tmp/kubeconfig-${CLUSTER}.yaml"
+# Edit kubeconfig to point to 127.0.0.1:${CLUSTER_APIENDPOINT_PORT}
+sed -i -e "s/${CLUSTER_APIENDPOINT_HOST}/127.0.0.1/" -e "s/:6443/:${CLUSTER_APIENDPOINT_PORT}/" "/tmp/kubeconfig-${CLUSTER}.yaml"
+# Port forward for accessing the API
+kubectl --context=kind-backing-cluster-1 -n "${NAMESPACE}" port-forward \
+ --address "${CLUSTER_APIENDPOINT_HOST},127.0.0.1" svc/test-kube-apiserver "${CLUSTER_APIENDPOINT_PORT}":6443 &
+# Check that it is working
+kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" cluster-info
+
+
+
Now that we have a working API for the workload cluster, the only remaining thing is to put everything that the controllers expect in it.
+This includes adding a Node to match the Machine as well as static pods that Cluster API expects to be there.
+Let’s start with the Node!
+The Node must have the correct name and a label with the BareMetalHost UID so that the controllers can put the correct provider ID on it.
+We have only created 1 BareMetalHost so it is easy to pick the correct one.
+The name of the Node should be the same as the Machine, which is also only a single one.
+
+
machine="$(kubectl -n"${NAMESPACE}" get machine -ojsonpath="{.items[0].metadata.name}")"
+bmh_uid="$(kubectl -n"${NAMESPACE}" get bmh -ojsonpath="{.items[0].metadata.uid}")"
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/fake-node.yaml |
+ sed -e "s/fake-node/${machine}/g" -e "s/fake-uuid/${bmh_uid}/g" | \
+ kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" create -f -
+# Label it as control-plane since this is a control-plane node.
+kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" label node "${machine}" node-role.kubernetes.io/control-plane=""
+# Upload kubeadm config to configmap. This will mark the KCP as initialized.
+kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" -n kube-system create cm kubeadm-config \
+ --from-file=ClusterConfiguration="/tmp/kubeadm-config-${CLUSTER}.yaml"
+
+
+
This should be enough to make the Machines healthy!
+You should be able to see something similar to this:
However, if you check the KubeadmControlPlane more carefully, you will notice that it is still complaining about control plane components.
+This is because we have not created the static pods yet, and it is also unable to check the certificate expiration date for the Machine.
+Let’s fix it:
+
+
# Add static pods to make kubeadm control plane manager happy
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/kube-apiserver-pod.yaml |
+ sed "s/node-name/${machine}/g" |
+ kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" create -f -
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/kube-controller-manager-pod.yaml |
+ sed "s/node-name/${machine}/g" |
+ kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" create -f -
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/kube-scheduler-pod.yaml |
+ sed "s/node-name/${machine}/g" |
+ kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" create -f -
+# Set status on the pods (it is not added when using create/apply).
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/kube-apiserver-pod-status.yaml |
+ kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" -n kube-system patch pod "kube-apiserver-${machine}" \
+ --subresource=status --patch-file=/dev/stdin
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/kube-controller-manager-pod-status.yaml |
+ kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" -n kube-system patch pod "kube-controller-manager-${machine}" \
+ --subresource=status --patch-file=/dev/stdin
+curl -L https://github.com/Nordix/metal3-clusterapi-docs/raw/main/metal3-scaling-experiments/kube-scheduler-pod-status.yaml |
+ kubectl --kubeconfig="/tmp/kubeconfig-${CLUSTER}.yaml" -n kube-system patch pod "kube-scheduler-${machine}" \
+ --subresource=status --patch-file=/dev/stdin
+
+# Add certificate expiry annotations to make kubeadm control plane manager happy
+CERT_EXPIRY_ANNOTATION="machine.cluster.x-k8s.io/certificates-expiry"
+EXPIRY_TEXT="$(kubectl -n "${NAMESPACE}" get secret apiserver -ojsonpath="{.data.tls\.crt}" | base64 -d | openssl x509 -enddate -noout | cut -d= -f 2)"
+EXPIRY="$(date --date="${EXPIRY_TEXT}" --iso-8601=seconds)"
+kubectl -n "${NAMESPACE}" annotate machine "${machine}" "${CERT_EXPIRY_ANNOTATION}=${EXPIRY}"
+kubectl -n "${NAMESPACE}" annotate kubeadmconfig --all "${CERT_EXPIRY_ANNOTATION}=${EXPIRY}"
+
+
+
Now we finally have a completely healthy cluster as far as the controllers are concerned.
+
+
Conclusions and summary
+
+
We now have all the tools necessary to start experimenting.
+
+
+
With the BareMetal Operator running in test mode, we can skip Ironic and still work with BareMetalHosts that act like normal.
+
We can set up separate “backing” clusters where we run etcd and multiple API servers to fake the workload cluster API’s.
+
Fake Nodes and Pods can be easily added to the workload cluster API’s, and configured as we want.
+
The workload cluster API’s can be exposed to the controllers in the test cluster using port-forwarding.
+
+
+
In this post we have not automated all of this, but if you want to see a scripted setup, take a look at this.
+It is what we used to scale to 1000 clusters.
+Just remember that it may need some tweaking for your specific environment if you want to try it out!
+
+
Specifically we used 10 “backing” clusters, i.e. 10 separate cloud VMs with kind clusters where we run etcd and the workload cluster API’s.
+Each one would hold 100 API servers.
+The test cluster was on its own separate VM also running a kind cluster with all the controllers and all the Cluster objects, etc.
+
+
In the next and final blog post of this series we will take a look at the results of all this.
+What issues did we run into along the way?
+How did we fix or work around them?
+We will also take a look at what is going on in the community related to this and discuss potential future work in the area.
]]>Lennart JernScaling to 1000 clusters - Part 12023-05-05T00:00:00-05:002023-05-05T00:00:00-05:00https://metal3.io/blog/2023/05/05/Scaling_part_1We want to ensure that Metal3 can scale to thousands of nodes and clusters.
+However, running tests with thousands of real servers is expensive and we don’t have access to any such large environment in the project.
+So instead we have been focusing on faking the hardware while trying to keep things as realistic as possible for the controllers.
+In this first part we will take a look at the Bare Metal Operator and the test mode it offers.
+The next part will be about how to fake the Kubernetes API of the workload clusters.
+In the final post we will take a look at the issues we ran into and what is being done in the community to address them so that we can keep scaling!
+
+
Some background on how to fool the controllers
+
+
With the full Metal3 stack, from Ironic to Cluster API, we have the following controllers that operate on Kubernetes APIs:
+
+
+
Cluster API Kubeadm control plane controller
+
Cluster API Kubeadm bootstrap controller
+
Cluster API controller
+
Cluster API provider for Metal3 controller
+
IP address manager controller
+
Bare Metal Operator controller
+
+
+
We will first focus on the controllers that interact with Nodes, Machines, Metal3Machines and BareMetalHosts, i.e. objects related to actual physical machines that we need to fake.
+In other words, we are skipping the IP address manager for now.
+
+
What do these controllers care about really?
+What do we need to do to fool them?
+At the Cluster API level, the controllers just care about the Kubernetes resources in the management cluster (e.g. Clusters and Machines) and some resources in the workload cluster (e.g. Nodes and the etcd Pods).
+The controllers will try to connect to the workload clusters in order to check the status of the resources there, so if there is no real workload cluster, this is something we will need to fake if we want to fool the controllers.
+When it comes to the Cluster API provider for Metal3, it connects the abstract high-level objects with the BareMetalHosts, so here we will need to make the BareMetalHosts behave realistically in order to provide a good test.
+
+
This is where the Bare Metal Operator test mode comes in.
+If we can fake the workload cluster API and the BareMetalHosts, then all the Cluster API controllers and the Metal3 provider will get a realistic test that we can use when working on scalability.
+
+
Bare Metal Operator test mode
+
+
The Bare Metal Operator has a test mode, in which it doesn’t talk to Ironic.
+Instead it just pretends that everything is fine and all actions succeed.
+In this mode the BareMetalHosts will move through the state diagram just like they normally would (but quite a bit faster).
+To enable it, all you have to do is add the -test-mode flag when running the Bare Metal Operator controller.
+For convenience there is also a make target (make run-test-mode) that will run the Bare Metal Operator directly on the host in test mode.
+
+
Here is an example of how to use it.
+You will need kind and kubectl installed for this to work, but you don’t need the Bare Metal Operator repository cloned.
+
+
+
+
Create a kind cluster and deploy cert-manager (needed for web hook certificates):
# Create the namespace where it will run
+kubectl create ns baremetal-operator-system
+# Deploy it in normal mode
+kubectl apply -k https://github.com/metal3-io/baremetal-operator/config/default
+# Patch it to run in test mode
+kubectl patch -n baremetal-operator-system deploy baremetal-operator-controller-manager --type=json \
+ -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--test-mode"}]'
+
+
+
+
In a separate terminal, create a BareMetalHost from the example manifests:
After applying the BareMetalHost, it will quickly go through registering and become available.
+
+
$kubectl get bmh
+NAME STATE CONSUMER ONLINE ERROR AGE
+example-baremetalhost registering true 2s
+$kubectl get bmh
+NAME STATE CONSUMER ONLINE ERROR AGE
+example-baremetalhost available true 6s
+
+
+
We can now provision the BareMetalHost, turn it off, deprovision, etc.
+Just like normal, except that the machine doesn’t exist.
+Let’s try provisioning it!
You will see it go through provisioning and end up in provisioned state:
+
+
$kubectl get bmh
+NAME STATE CONSUMER ONLINE ERROR AGE
+example-baremetalhost provisioning true 7m20s
+
+$kubectl get bmh
+NAME STATE CONSUMER ONLINE ERROR AGE
+example-baremetalhost provisioned true 7m22s
+
+
+
Wrapping up
+
+
With Bare Metal Operator in test mode, we have the foundation for starting our scalability journey.
+We can easily create BareMetalHost objects and they behave similarly to how they would in a real scenario.
+A simple bash script will at this point allow us to create as many BareMetalHosts as we would like.
+To wrap things up, we will now do just that: put together a script and try generating a few BareMetalHosts.
+
+
The script will do the same thing we did before when creating the example BareMetalHost, but it will also give them different names so we don’t get naming collisions.
+Here it is:
Save it as produce-available-hosts.sh and try it out:
+
+
$./produce-available-hosts.sh 10 | kubectl apply -f -
+secret/worker-1-bmc-secret created
+baremetalhost.metal3.io/worker-1 created
+secret/worker-2-bmc-secret created
+baremetalhost.metal3.io/worker-2 created
+secret/worker-3-bmc-secret created
+baremetalhost.metal3.io/worker-3 created
+secret/worker-4-bmc-secret created
+baremetalhost.metal3.io/worker-4 created
+secret/worker-5-bmc-secret created
+baremetalhost.metal3.io/worker-5 created
+secret/worker-6-bmc-secret created
+baremetalhost.metal3.io/worker-6 created
+secret/worker-7-bmc-secret created
+baremetalhost.metal3.io/worker-7 created
+secret/worker-8-bmc-secret created
+baremetalhost.metal3.io/worker-8 created
+secret/worker-9-bmc-secret created
+baremetalhost.metal3.io/worker-9 created
+secret/worker-10-bmc-secret created
+baremetalhost.metal3.io/worker-10 created
+$kubectl get bmh
+NAME STATE CONSUMER ONLINE ERROR AGE
+worker-1 registering true 2s
+worker-10 available true 2s
+worker-2 available true 2s
+worker-3 available true 2s
+worker-4 available true 2s
+worker-5 available true 2s
+worker-6 registering true 2s
+worker-7 available true 2s
+worker-8 available true 2s
+worker-9 available true 2s
+
+
+
With this we conclude the first part of the scaling series.
+In the next post, we will take a look at how to fake the other end of the stack: the workload cluster API.
]]>Lennart JernOne cluster - multiple providers2022-07-08T00:00:00-05:002022-07-08T00:00:00-05:00https://metal3.io/blog/2022/07/08/One_cluster_multiple_providersRunning on bare metal has both benefits and drawbacks. You can get the
+best performance possible out of the hardware, but it can also be quite
+expensive and maybe not necessary for all workloads. Perhaps a hybrid
+cluster could give you the best of both? Raw power for the workload that
+needs it, and cheap virtualized commodity for the rest. This blog post
+will show how to set up a cluster like this using the Cluster API backed
+by the Metal3 and BYOH providers.
+
+
The problem
+
+
Imagine that you have some bare metal servers that you want to use for
+some specific workload. Maybe the workload benefits from the specific
+hardware or there are some requirements that make it necessary to run it
+there. The rest of the organization already uses Kubernetes and the
+cluster API everywhere so of course you want the same for this as well.
+Perfect, grab Metal³ and start working!
+
+
But hold on, this would mean that you use some of the servers for
+running the Kubernetes control plane and possibly all the cluster API
+controllers. If there are enough servers this is probably not an issue,
+but do you really want to “waste” these servers on such generic
+workloads that could be running anywhere? This can become especially
+painful if you need multiple control plane nodes. Each server is
+probably powerful enough to run all the control planes and controllers,
+but it would be a single point of failure…
+
+
What if there was a way to use a different cluster API infrastructure
+provider for some nodes? For example, use the Openstack infrastructure
+provider for the control plane and Metal³ for the workers. Let’s do an
+experiment!
+
+
Setting up the experiment environment
+
+
This blog post will use the Bring your own
+host
+(BYOH) provider together with Metal³ as a proof of concept to show what
+is currently possible.
+
+
The BYOH provider was chosen as the second provider for two reasons:
+
+
+
Due to its design (you provision the host yourself), it is very easy
+to adapt it to the test (e.g. use a VM in the same network that the
+metal3-dev-env uses).
+
It is one of the providers that is known to work when combining
+multiple providers for a single cluster.
+
+
+
We will be using the
+metal3-dev-env on Ubuntu
+as a starting point for this experiment. Note that it makes substantial
+changes to the machine where it is running, so you may want to use a
+dedicated lab machine instead of your laptop for this. If you have not
+done so already, clone it and run make. This should give you a
+management cluster with the Metal³ provider installed and two
+BareMetalHosts ready for provisioning.
+
+
The next step is to add the BYOH provider and a ByoHost.
+
+
clusterctl init --infrastructure byoh
+
+
+
For the ByoHost we will use Vagrant.
+You can install it with sudo apt install vagrant.
+Then copy the Vagrantfile below to a new folder and run vagrant up.
+
+
# -*- mode: ruby -*-
+hosts = {
+ "control-plane1" => { "memory" => 2048, "ip" => "192.168.10.10"},
+ # "control-plane2" => { "memory" => 2048, "ip" => "192.168.10.11"},
+ # "control-plane3" => { "memory" => 2048, "ip" => "192.168.10.12"},
+}
+
+
+Vagrant.configure("2") do |config|
+ # Choose which box you want below
+ config.vm.box = "generic/ubuntu2004"
+ config.vm.synced_folder ".", "/vagrant", disabled: true
+ config.vm.provider :libvirt do |libvirt|
+ # QEMU system connection is required for private network configuration
+ libvirt.qemu_use_session = false
+ end
+
+
+ # Loop over all machine names
+ hosts.each_key do |host|
+ config.vm.define host, primary: host == hosts.keys.first do |node|
+ node.vm.hostname = host
+ node.vm.network :private_network, ip: hosts[host]["ip"],
+ libvirt__forward_mode: "route"
+ node.vm.provider :libvirt do |lv|
+ lv.memory = hosts[host]["memory"]
+ lv.cpus = 2
+ end
+ end
+ end
+end
+
+
+
Vagrant should now have created a new VM to use as a ByoHost. Now we
+just need to run the BYOH agent in the VM to make it register as a
+ByoHost in the management cluster. The BYOH agent needs a kubeconfig
+file to do this, so we start by copying it to the VM:
+
+
+
+
cp ~/.kube/config ~/.kube/management-cluster.conf
+# Ensure that the correct IP is used (not localhost)
+export KIND_IP=$(docker inspect -f'{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' kind-control-plane)
+sed -i 's/ server\:.*/ server\: https\:\/\/'"$KIND_IP"'\:6443/g' ~/.kube/management-cluster.conf
+scp -i .vagrant/machines/control-plane1/libvirt/private_key \
+ /home/ubuntu/.kube/management-cluster.conf vagrant@192.168.10.10:management-cluster.conf
+
+
+
+
+
+
Next, install the prerequisites and host agent in the VM and run it.
You should now have a management cluster with both the Metal³ and BYOH
+providers installed, as well as two BareMetalHosts and one ByoHost.
+
+
$kubectl -n metal3 get baremetalhosts,byohosts
+NAME STATE CONSUMER ONLINE ERROR AGE
+baremetalhost.metal3.io/node-0 available true 18m
+baremetalhost.metal3.io/node-1 available true 18m
+
+
+NAME AGE
+byohost.infrastructure.cluster.x-k8s.io/control-plane1 73s
+
+
+
Creating a multi-provider cluster
+
+
The trick is to create both a Metal3Cluster and a ByoCluster that are
+owned by one common Cluster. We will use the ByoCluster for the control
+plane in this case. First the Cluster:
Add the rest of the BYOH manifests to get a control plane.
+The code is collapsed here for easier reading.
+Please click on the line below to expand it.
So far this is a “normal” Cluster backed by the BYOH provider. But now
+it is time to do something different. Instead of adding more ByoHosts as
+workers, we will add a Metal3Cluster and MachineDeployment backed by
+BareMetalHosts! Note that the controlPlaneEndpoint of the
+Metal3Cluster must point to the same endpoint that the ByoCluster is
+using.
These manifests are quite large but they are just the same as would be
+used by the metal3-dev-env with some name changes here and there. The
+key thing to note is that all references to a Cluster are to the one we
+defined above. Here is the MachineDeployment:
Finally, we add the Metal3MachineTemplate, Metal3DataTemplate and
+KubeadmConfigTemplate. Here you may want to add your public ssh key in
+the KubeadmConfigTemplate (the last few lines).
$kubectl get nodes
+NAME STATUS ROLES AGE VERSION
+control-plane1 Ready control-plane,master 88m v1.23.5
+test1-8767dbccd-24cl5 Ready <none> 82m v1.23.5
+
+
+
Going back to the management cluster, we can inspect the state of the
+cluster API resources.
As we have seen in this post, it is possible to combine at least some
+infrastructure providers when creating a single cluster. This can be
+useful for example if a provider has a high cost or limited resources.
+Furthermore, the use case is not addressed by MachineDeployments since
+they would all be from the same provider (even though they can have
+different properties).
+
+
There is some room for development and improvement though. The most
+obvious thing is perhaps that Clusters only have one
+infrastructureRef. This means that the cluster API controllers are not
+aware of the “secondary” infrastructure provider(s).
+
+
Another thing that may be less obvious is the reliance on Nodes and
+Machines in the Kubeadm control plane provider. It is not an issue in
+the example we have seen here since both Metal³ and BYOH create Nodes.
+However, there are some projects where Nodes are unnecessary. See for
+example Kamaji, which aims to
+integrate with the cluster API. The idea here is to run the control
+plane components in the management cluster as Pods. Naturally, there
+would not be any control plane Nodes or Machines in this case. (A second
+provider would be used to add workers.) But the Kubeadm control plane
+provider expects there to be both Machines and Nodes for the control
+plane, so a new provider is likely needed to make this work as desired.
+
+
This issue can already be seen in the
+vcluster
+provider, where the Cluster stays in Provisioning state because it is
+“Waiting for the first control plane machine to have its
+status.nodeRef set”. The idea with vcluster is to reuse the Nodes of
+the management cluster but provide a separate control plane. This gives
+users better isolation than just namespaces without the need for another
+“real” cluster. It is for example possible to have different custom
+resource definitions in each vcluster. But since vcluster runs all the
+pods (including the control plane) in the management cluster, there will
+never be a control plane Machine or nodeRef.
+
+
There is already one implementation of a control plane provider without
+Nodes, i.e. the EKS provider. Perhaps this is the way forward. One
+implementation for each specific case. It would be nice if it was
+possible to do it in a more generic way though, similar to how the
+Kubeadm control plane provider is used by almost all infrastructure
+providers.
+
+
To summarize, there is already some support for mixed clusters with
+multiple providers. However, there are some issues that make it
+unnecessarily awkward. Two things that could be improved in the cluster
+API would be the following:
+
+
+
Make the cluster.infrastructureRef into a list to allow multiple
+infrastructure providers to be registered.
+
Drop the assumption that there will always be control plane Machines
+and Nodes (e.g. by implementing a new control plane provider).
+]]>Lennart JernMetal3 Introduces Pivoting2021-05-05T00:00:00-05:002021-05-05T00:00:00-05:00https://metal3.io/blog/2021/05/05/PivotingMetal3 project has introduced pivoting in its CI workflow. The motivation for
+pivoting is to move all the objects from the ephemeral/management
+cluster to a target cluster. This blog post will briefly introduce the concept
+of pivoting and the impact it has on the overall CI workflow. For the rest of
+this blog, we refer to the ephemeral/management cluster as the ephemeral cluster.
+
+
What is Pivoting?
+
+
In the context of Metal3 Provider, Pivoting is the process of moving
+Cluster-API and Metal3 objects from the ephemeral k8s cluster to a target
+cluster. In Metal3, this process is performed using the
+clusterctl tool
+provided by Cluster-API. clusterctl recognizes pivoting as a move. During the
+pivot process, clusterctl pauses any reconciliation of Cluster-API objects and
+this gets propagated to Cluster-api-provider-metal3 (CAPM3) objects as well.
+Once all the objects are paused, the objects are created on the other side on
+the target cluster and deleted from the ephemeral cluster.
+
+
Prerequisites
+
+
Prior to the actual pivot process, the target cluster should already have the
+provider components, ironic containers and CNI installed and running. To perform
+pivot outside metal3-dev-env, specifically, the following points need to be
+addressed:
+
+
+
clusterctl is used to initialize both the ephemeral and target cluster.
+
BMH objects have correct status annotation.
+
Maintain connectivity towards the provisioning network.
+
Baremetal Operator(BMO) is deployed as part of CAPM3.
+
Objects should have a proper owner reference chain.
+
+
+
For a detailed explanation of the above-mentioned prerequisites please read the
+pivoting documentation.
+
+
Pivoting workflow in CI
+
+
The Metal3 CI currently includes pivoting as part of the deployment
+process both for Ubuntu and CentOS-based jobs. This essentially means all
+the PRs that go in, are tested through the pivoting workflow. Here is the
+CI deployment workflow:
+
+
+
make the metal3-dev-env.
+It gives us the ephemeral cluster with all the necessary controllers running
+within it. The corresponding metal3-dev-env command is make
+
provision target cluster. For normal integration tests, this step deploys
+a control-plane node and a worker in the target cluster. For feature-test
+and feature-test-upgrade, the provision step deploys three control-planes and
+a worker. The corresponding metal3-dev-env commands are (normal integration
+test workflow):
Initialize the provider components on the target cluster. This installs all
+the controllers and associated components related to cluster-api,
+cluster-api-provider-metal3, baremetal-operator and ironic. Since it is
+necessary to have only one set of ironic deployment/containers in the picture,
+this step also deletes the ironic deployment/containers from
+ephemeral cluster.
+
Move all the objects from ephemeral to the target cluster.
+
Check the status of the objects to verify whether the objects are being
+reconciled correctly by the controllers in the target cluster. This step
+verifies and finalizes the pivoting process. The corresponding metal3-dev-env
+command that performs this and the previous two steps is:
+
+
+
./scripts/feature_tests/pivoting/pivot.sh
+
+
+
+
Move the objects back to the ephemeral cluster. This step also
+removes the ironic deployment from the target cluster and reinstates the
+ironic deployment/containers in the ephemeral cluster. Since we do
+not delete the provider components in the ephemeral cluster,
+installing them again is not necessary. The corresponding metal3-dev-env command
+that performs this step is:
+
+
+
./scripts/feature_tests/pivoting/repivot.sh
+
+
+
+
De-provision the BMHs and delete the target cluster. The corresponding
+metal3-dev-env commands to de-provision the worker, the control plane and the cluster
+are as follows:
Note that if we de-provision the cluster, the worker and
+control plane are de-provisioned automatically.
+
+
Pivoting in Metal3
+
+
The pivoting process described above is realized in ansible scripts
+move.yml
+and
+move_back.yml.
+Under the hood, pivoting uses the move command from
+clusterctl
+provided by Cluster-API.
+
+
As stated earlier, for every PR that goes into a Metal3 repository where the
+integration tests are run, the code change introduced in the PR is now also
+verified with pivoting in the integration tests. Moreover, the upgrade workflow in
+Metal3 performs all the upgrade operations in Metal3 after pivoting to the
+target cluster.
]]>Kashif Nizam KhanIntroducing the Metal3 IP Address Manager2020-07-06T00:00:00-05:002020-07-06T00:00:00-05:00https://metal3.io/blog/2020/07/06/IP_address_managerAs a part of developing the Cluster API Provider Metal3 (CAPM3) v1alpha4
+release, the Metal3 crew introduced a new project: its own IP Address Manager.
+This blog post will go through the motivations behind such a project, the
+features that it brings, its use in Metal3 and future work.
+
+
What is the IP Address Manager?
+
+
The IP Address Manager (IPAM) is a controller that provides IP addresses and
+manages the allocations of IP subnets. It is not a DHCP server in that it only
+reconciles Kubernetes objects and does not answer any DHCP queries. It
+allocates IP addresses on request but does not handle any use of those
+addresses.
+
+
This sounds like the description of any IPAM system, no? Well, the twist
+is that this manager is based on Kubernetes to specifically handle some
+constraints from Metal3. We will go through the different issues that this
+project tackles.
+
+
When deploying nodes in a bare metal environment, there are a lot of possible
+variations. This project specifically aims to solve cases where static
+IP address configurations are needed. It is designed to specifically address
+this in the Cluster API (CAPI) context.
+
+
CAPI addresses the deployment of Kubernetes clusters and nodes, using
+the Kubernetes API. As such, it uses objects such as Machine Deployments
+(similar to deployments for pods) that take care of creating the requested
+number of machines, based on templates. The replicas can be increased by the
+user, triggering the creation of new machines based on the provided templates.
+This mechanism does not allow for flexibility to be able to provide static
+addresses for each machine. The manager adds this flexibility by providing
+the address right before provisioning the node.
+
+
In addition, all the resources from the source cluster must support the CAPI
+pivoting, i.e. being copied and recreated in the target cluster. This means
+that all objects must contain all needed information in their spec field to
+recreate the status in the target cluster without losing information. All
+objects must, through a tree of owner references, be attached to the cluster
+object, for the pivoting to proceed properly.
+
+
In a nutshell, the manager provides an IP Address allocation service, based
+on Kubernetes API and fulfilling the needs of Metal3, specifically the
+requirements of CAPI.
+
+
How does it work?
+
+
The manager follows the same logic as the volume allocation in Kubernetes,
+with a claim and an object created for that claim. There are three types of
+objects defined, the IPPool, the IPClaim and the IPAddress objects.
+
+
The IPPool objects contain the definition of the IP subnets from which the
+Addresses are allocated. It supports both IPv4 and IPv6. The subnets can either
+be defined as such or given as start and end IP addresses with a prefix.
+It also supports pre-allocating IP addresses.
Whenever something requires an IP address from the IPPool, it will create an
+IPClaim. The IPClaim contains a pointer to the IPPool and an owner reference
+to the object that created it.
The controller will then reconcile this object and allocate an IP address. It
+will create an IPAddress object representing the allocated address. It will
+then update the IPPool status to list the IP Address and the IPClaim status
+to point to the IPAddress.
The IP Address Manager is used in Metal3 together with the metadata and network
+data templates feature. Each Metal3Machine (M3M) and Metal3MachineTemplate
+(M3MT) is associated with a Metal3DataTemplate that contains metadata and /
+or a network data template that will be rendered for each Metal3Machine. The
+rendered data will then be provided to Ironic. Those templates reference
+IPPool objects. For each Metal3Machine, an IPClaim is created for each
+IPPool, and the templates are rendered with the allocated IPAddress.
+
+
This is how we achieve dynamic IP Address allocations in setups that
+require static configuration, allowing us to use Machine Deployment and Kubeadm
+Control Plane objects from CAPI in hardware labs where DHCP is not supported.
+
+
Since each IPAddress has an owner reference set to its IPClaim object, and
+IPClaim objects have an owner reference set to the Metal3Data object created
+from the Metal3DataTemplate, the owner reference chain links a Metal3Machine to
+all the IPClaim and IPAddress objects that were created for it, allowing for CAPI
+pivoting.
+
+
What now?
+
+
The project is fulfilling its basic requirements, but we are looking into
+extending it and covering more use cases. For example, we are looking at
+adding integration with Infoblox and other external IPAM services. Do not
+hesitate to open an issue if you have some ideas for new features!
+ The Metal3 Project's mission is to empower organizations with a flexible,
+ open-source solution for bare metal provisioning that combines
+ the benefits of bare metal performance with the ease of use and
+ automation provided by Kubernetes.
+
+
Goals
+
There are a number of great open source tools for bare metal host provisioning,
+ including Ironic.
+ Metal3 aims to build on these technologies to provide a Kubernetes native API for managing bare
+ metal hosts via a provisioning stack that is also running on Kubernetes. We believe that Kubernetes
+ Native Infrastructure, or managing your infrastructure just like your applications, is a powerful next
+ step in the evolution of infrastructure management.
+
+ The Metal3 project is also building integration with the Kubernetes
+ cluster-api project, allowing Metal3 to be used as an infrastructure
+ backend for Machine objects from the Cluster API. These components integrate
+ seamlessly to leverage the Kubernetes ecosystem and automate the provisioning
+ and management of bare-metal infrastructure.
+
Connect with the Metal3.io community to learn more, contribute ideas, and help build the future of Metal3.io. Learn more about how to make the most of the Metal3.io features in your work.
+
As Metal3.io and most of the infrastructure of the Metal3 Project are
+currently hosted by Red Hat Inc., this site falls under the
+Red Hat Privacy Policy.
+All terms of that privacy policy apply to this site. Should we change
+our hosting in the future, this Privacy Policy will be updated.
+
+
How to Contact Us
+
+
If you have any questions about any of these practices or Metal3’s use
+of your personal information, please feel free to contact
+us or file an
+Issue
+in our GitHub repo.
+
+
Metal3 will work with you to resolve any concerns you may have about
+this Statement.
+
+
Changes to this Privacy Statement
+
+
Metal3 reserves the right to change this policy from time to time. If we
+do make changes, the revised Privacy Statement will be posted on this
+site. A notice will be posted on our blog and/or mailing lists whenever
+this Privacy Statement is changed in a material way.
+
+
This Privacy Statement was last amended on September 25, 2019.