> ## Documentation Index
> Fetch the complete documentation index at: https://docs.cekura.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Knowledge Base Connectors

> Connect external data sources to your AI agents using knowledge base connectors. Automatically sync content from websites, databases, and other sources to keep your agent's knowledge up to date.

export const CopyPageButton = () => {
  // Headless MDX component: it renders nothing itself. In the browser it schedules a
  // one-time DOM injection of a "Copy page" toolbar (main button + dropdown) at the top
  // of the docs content area. On the server (no `window`) it is a no-op.
  // Always returns null.
  if (typeof window === 'undefined') {
    return null;
  }
  // Deferred slightly so the content container exists before we look it up.
  setTimeout(() => {
    // Only ever inject one toolbar per document.
    if (document.getElementById('ck-tools')) return;
    const anchor = document.getElementById('content-area') || document.querySelector('.mdx-content');
    if (!anchor) return;
    if (!document.getElementById('ck-style')) {
      const styleEl = document.createElement('style');
      styleEl.id = 'ck-style';
      styleEl.textContent = [
        '#ck-tools{position:absolute;top:6px;right:0;z-index:100;font-family:inherit;}',
        '.ck-row{display:inline-flex;align-items:stretch;border:1px solid rgba(0,0,0,0.15);border-radius:8px;overflow:hidden;background:#fff;}',
        ':root.dark .ck-row{background:rgba(255,255,255,0.06);border-color:rgba(255,255,255,0.12);}',
        '.ck-btn{padding:5px 12px;border:none;background:none;cursor:pointer;font-size:13px;font-weight:500;font-family:inherit;color:#374151;}',
        ':root.dark .ck-btn{color:#d1d5db;}',
        '.ck-btn:hover{background:rgba(0,0,0,0.04);}',
        ':root.dark .ck-btn:hover{background:rgba(255,255,255,0.06);}',
        '.ck-chevron{padding:5px 8px;border:none;background:none;cursor:pointer;font-size:14px;font-family:inherit;color:#374151;}',
        ':root.dark .ck-chevron{color:#d1d5db;}',
        '.ck-chevron:hover{background:rgba(0,0,0,0.04);}',
        ':root.dark .ck-chevron:hover{background:rgba(255,255,255,0.06);}',
        '.ck-divider{width:1px;background:rgba(0,0,0,0.12);flex-shrink:0;}',
        ':root.dark .ck-divider{background:rgba(255,255,255,0.12);}',
        '.ck-dd{position:absolute;top:calc(100% + 4px);right:0;min-width:180px;background:#fff;border:1px solid rgba(0,0,0,0.12);border-radius:8px;box-shadow:0 4px 12px rgba(0,0,0,0.1);padding:4px;display:none;z-index:200;}',
        ':root.dark .ck-dd{background:#1f2937;border-color:rgba(255,255,255,0.1);box-shadow:0 4px 16px rgba(0,0,0,0.35);}',
        '.ck-item{display:block;width:100%;padding:7px 12px;border:none;background:none;border-radius:6px;cursor:pointer;font-size:13px;font-family:inherit;text-align:left;color:#374151;}',
        ':root.dark .ck-item{color:#d1d5db;}',
        '.ck-item:hover{background:rgba(0,0,0,0.05);}',
        ':root.dark .ck-item:hover{background:rgba(255,255,255,0.07);}',
      ].join('');
      document.head.appendChild(styleEl);
    }
    const wrap = document.createElement('div');
    wrap.id = 'ck-tools';
    const row = document.createElement('div');
    row.className = 'ck-row';
    const mainBtn = document.createElement('button');
    mainBtn.className = 'ck-btn';
    mainBtn.textContent = 'Copy page';
    const divider = document.createElement('span');
    divider.className = 'ck-divider';
    const chevron = document.createElement('button');
    chevron.className = 'ck-chevron';
    chevron.textContent = '▾';
    const dd = document.createElement('div');
    dd.className = 'ck-dd';
    const closeDD = () => {
      dd.style.display = 'none';
    };
    const openDD = () => {
      dd.style.display = 'block';
    };
    chevron.onclick = (e) => {
      // Keep this click from reaching the document-level "click outside" handler.
      e.stopPropagation();
      if (dd.style.display === 'block') {
        closeDD();
      } else {
        openDD();
      }
    };
    // Document-level listeners outlive the toolbar if the host page removes it (e.g. on
    // client-side navigation), so each handler unregisters itself once the toolbar is detached.
    function detachDocListeners() {
      document.removeEventListener('click', onDocClick);
      document.removeEventListener('keydown', onDocKeydown);
    }
    function onDocClick(e) {
      if (!wrap.isConnected) {
        detachDocListeners();
        return;
      }
      // contains() works for any Node target; closest() only exists on Elements.
      if (!wrap.contains(e.target)) {
        closeDD();
      }
    }
    function onDocKeydown(e) {
      if (!wrap.isConnected) {
        detachDocListeners();
        return;
      }
      if (e.key === 'Escape') {
        closeDD();
      }
    }
    document.addEventListener('click', onDocClick);
    document.addEventListener('keydown', onDocKeydown);
    // Builds one dropdown entry that runs `fn` and then closes the dropdown.
    const makeItem = (label, fn) => {
      const item = document.createElement('button');
      item.className = 'ck-item';
      item.textContent = label;
      item.onclick = () => {
        fn();
        closeDD();
      };
      return item;
    };
    const SKIPPED_TAGS = ['script', 'style', 'svg', 'noscript', 'button', 'iframe'];
    const HEADING_TAG = /^h([1-6])$/;
    const walkChildren = (node) => Array.from(node.childNodes).map(walk).join('');
    // Recursively converts a DOM node into a rough Markdown string.
    function walk(node) {
      if (!node) return '';
      if (node.nodeType === 3) return node.textContent || '';
      if (node.nodeType !== 1) return '';
      const tag = node.tagName.toLowerCase();
      if (SKIPPED_TAGS.includes(tag) || node.id === 'ck-tools') return '';
      // Code blocks are emitted verbatim from their text, not from converted children.
      if (tag === 'pre') return '\n```\n' + node.textContent.trim() + '\n```\n\n';
      if (tag === 'ol') {
        // Number the direct <li> children so ordered lists keep their ordering.
        let position = 0;
        return Array.from(node.childNodes).map((child) => {
          const isItem = child.nodeType === 1 && child.tagName.toLowerCase() === 'li';
          if (!isItem) return walk(child);
          position += 1;
          return position + '. ' + walkChildren(child).trim() + '\n';
        }).join('');
      }
      const inner = walkChildren(node);
      const heading = HEADING_TAG.exec(tag);
      // All six heading levels map to the matching number of '#' characters.
      if (heading) return '\n' + '#'.repeat(Number(heading[1])) + ' ' + inner.trim() + '\n\n';
      if (tag === 'p') return '\n' + inner.trim() + '\n\n';
      if (tag === 'li') return '- ' + inner.trim() + '\n';
      if (tag === 'code') return '`' + inner.trim() + '`';
      return inner;
    }
    // Returns the page content as Markdown, with runs of blank lines collapsed.
    const getMarkdown = () => {
      const content = document.querySelector('.mdx-content') || document.getElementById('content-area') || document.body;
      return walk(content).replace(/\n\n\n+/g, '\n\n').trim();
    };
    let labelResetTimer = null;
    // Shows a temporary label on the main button, then restores the default after 2s.
    const flashLabel = (label) => {
      mainBtn.textContent = label;
      clearTimeout(labelResetTimer);
      labelResetTimer = setTimeout(() => {
        mainBtn.textContent = 'Copy page';
      }, 2000);
    };
    const copyMd = () => {
      const md = getMarkdown();
      // navigator.clipboard is undefined outside secure contexts; report instead of throwing.
      if (!navigator.clipboard || typeof navigator.clipboard.writeText !== 'function') {
        flashLabel('Copy failed');
        return;
      }
      // The write can reject (e.g. permission denied); surface that to the user as well.
      navigator.clipboard.writeText(md).then(
        () => flashLabel('Copied!'),
        () => flashLabel('Copy failed')
      );
    };
    const viewMd = () => {
      const md = getMarkdown();
      // Escape HTML metacharacters so the Markdown is displayed as text, not parsed as markup.
      const safe = md.split('&').join('&amp;').split('<').join('&lt;').split('>').join('&gt;');
      const html = '<!DOCTYPE html><html><head><meta charset="utf-8"><style>body{font-family:monospace;max-width:860px;margin:40px auto;padding:0 24px;line-height:1.7;white-space:pre-wrap;word-wrap:break-word}</style></head><body>' + safe + '</body></html>';
      const blobUrl = URL.createObjectURL(new Blob([html], {
        type: 'text/html'
      }));
      window.open(blobUrl, '_blank');
      // Release the blob once the new tab has had ample time to load it.
      setTimeout(() => {
        URL.revokeObjectURL(blobUrl);
      }, 60000);
    };
    const openClaude = () => {
      const prompt = 'Can you read this Cekura docs page ' + window.location.href + ' so I can ask you questions?';
      // noopener: the external page must not receive a window.opener handle to the docs tab.
      window.open('https://claude.ai/new?q=' + encodeURIComponent(prompt), '_blank', 'noopener');
    };
    mainBtn.onclick = copyMd;
    dd.appendChild(makeItem('Copy page', copyMd));
    dd.appendChild(makeItem('View as Markdown', viewMd));
    dd.appendChild(makeItem('Open in Claude', openClaude));
    row.appendChild(mainBtn);
    row.appendChild(divider);
    row.appendChild(chevron);
    wrap.appendChild(row);
    wrap.appendChild(dd);
    // The toolbar is absolutely positioned, so its container must establish a positioning context.
    anchor.style.position = 'relative';
    anchor.insertBefore(wrap, anchor.firstChild);
  }, 50);
  return null;
};

<CopyPageButton />

## Overview

Knowledge base connectors allow you to automatically sync content from external sources into your AI agent's knowledge base. Instead of manually uploading files, connectors can fetch and update content programmatically, ensuring your agents always have access to the latest information.

Each connector can create and manage multiple knowledge base files, automatically keeping them in sync with the source data. When a connector syncs, it updates all associated files, and when deleted, it cleans up all files it created.

## Available Connectors

### Website Scraper Connector

The Website Scraper connector fetches and extracts content from web pages, making it easy to keep your agent informed about documentation, help articles, or any web-based content.

Depending on the configuration, the Website Scraper can create either a single consolidated file or multiple files (one per page) from the scraped content.

#### Use Cases

* **Documentation Sites**: Keep your agent updated with the latest product documentation
* **Help Centers**: Sync FAQ pages and support articles
* **Blog Posts**: Include recent blog content in your agent's knowledge
* **Company Pages**: Pull content from About Us, Terms of Service, or other key pages

#### Configuration

To use the Website Scraper connector, you need to configure it with the following parameters:

##### Required Parameters

| Parameter | Type   | Description                                        |
| --------- | ------ | -------------------------------------------------- |
| `url`     | string | The web page URL to scrape (must be HTTP or HTTPS) |

##### Optional Parameters

| Parameter   | Type    | Default | Description                                                 |
| ----------- | ------- | ------- | ----------------------------------------------------------- |
| `timeout`   | integer | 30      | Request timeout in seconds                                  |
| `selectors` | object  | null    | Custom CSS or tag selectors for targeted content extraction |

##### Credentials (Optional)

| Field     | Type   | Description                                                                  |
| --------- | ------ | ---------------------------------------------------------------------------- |
| `headers` | object | Custom HTTP headers for authenticated requests (e.g., API keys, auth tokens) |

#### Basic Example

```json theme={null}
{
  "connector_type": "website",
  "config": {
    "url": "https://docs.example.com/api-guide"
  },
  "credentials": {}
}
```

#### Advanced Example with Selectors

For more control over what content gets extracted, you can specify custom selectors:

```json theme={null}
{
  "connector_type": "website",
  "config": {
    "url": "https://docs.example.com/api-guide",
    "timeout": 60,
    "selectors": {
      "Main Content": {
        "selector": "article.documentation",
        "type": "css"
      },
      "Code Examples": {
        "selector": "pre.code-block",
        "type": "css"
      },
      "Headers": {
        "selector": "h2",
        "type": "tag"
      }
    }
  },
  "credentials": {}
}
```

#### Authenticated Requests Example

If the website requires authentication or custom headers:

```json theme={null}
{
  "connector_type": "website",
  "config": {
    "url": "https://internal-docs.example.com/guide"
  },
  "credentials": {
    "headers": {
      "Authorization": "Bearer your-api-token",
      "X-Custom-Header": "custom-value"
    }
  }
}
```

#### Content Extraction Behavior

The Website Scraper automatically:

1. **Validates URLs**: Only HTTP/HTTPS schemes are allowed, and private IPs/localhost are blocked for security
2. **Cleans Content**: Removes script tags, styles, navigation, footers, and other non-content elements
3. **Formats Text**: Extracts clean, readable text with proper line breaks
4. **Manages Files**: Creates and updates knowledge base files, maintaining associations with the connector

**Default Extraction Strategy** (when no selectors are provided):

* First tries to find an `<article>` tag
* Falls back to `<main>` or `<div class="content">`
* If none of these exist, extracts all text from `<body>`

**Custom Selectors** (when provided):

* Extracts content matching each selector
* Supports both CSS selectors and HTML tag names
* Each section is labeled with the selector name

**Multi-Page Support**:

* When configured for multi-page scraping, the connector creates separate knowledge base files for each page
* All files are automatically tracked and managed by the connector
* Subsequent syncs update all associated files

#### Security Features

The Website Scraper includes built-in protections against SSRF (Server-Side Request Forgery) attacks:

* Blocks requests to private IP ranges (e.g., 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16)
* Blocks localhost and loopback addresses
* Blocks link-local addresses (e.g., AWS metadata service at 169.254.169.254)
* Only allows HTTP and HTTPS protocols

#### Limitations

* Minimum content length: 100 characters (pages with less content will fail)
* Does not execute JavaScript (static HTML only)
* Cannot handle pages requiring complex authentication flows
* Cannot scrape content behind CAPTCHAs or bot protection

### BigQuery Connector

<Note>
  Documentation for the BigQuery connector is coming soon.
</Note>

## When to Use Connectors vs. File Uploads

| Scenario                      | Recommended Approach                         |
| ----------------------------- | -------------------------------------------- |
| Content changes frequently    | Use connectors with scheduled syncing        |
| Static documents (PDFs, docs) | Direct file upload                           |
| Web-based documentation       | Website Scraper connector                    |
| Database queries              | BigQuery connector                           |
| One-time knowledge addition   | Direct file upload                           |
| Multiple related web pages    | Website Scraper with multiple configurations |

## Best Practices

1. **Start Simple**: Begin with basic URL configuration, then add selectors if needed
2. **Test Selectors**: Use browser dev tools to test CSS selectors before configuring
3. **Set Appropriate Timeouts**: Increase timeout for slow-loading pages
4. **Monitor Content Length**: Ensure scraped content meets the 100-character minimum
5. **Schedule Regular Syncs**: Keep knowledge base fresh by scheduling periodic syncs
6. **Use Specific Selectors**: Target main content areas to avoid extracting navigation and footers
7. **Review Synced Files**: Check that all expected files are being created and updated correctly

## Troubleshooting

### "Scraped content appears empty or too short"

* Check if the URL is correct and publicly accessible
* Verify selectors are matching the expected elements
* Try removing custom selectors to use default extraction
* Check if the page requires JavaScript (not supported)

### "URL validation failed"

* Ensure the URL uses HTTP or HTTPS
* Check that the URL doesn't point to a private IP or localhost
* Verify the hostname can be resolved

### "Network error while fetching URL"

* Increase the timeout value for slow-loading pages
* Check if the website requires authentication headers
* Verify the URL is accessible from your network

## API Reference

For programmatic access to knowledge base management, see:

* [Update Agent](/api-reference/test_framework/update-agent-partial) — manage knowledge base files via the `knowledge_base_files` field (reference existing files by `id`, add new ones with `file_name` + `content_base64`)

## Next Steps

* Set up your first connector
* Schedule automated syncs
* Monitor sync status and errors
* Combine multiple connectors for comprehensive knowledge bases