SQL Query Cookbook for DuckDB CPG¶
A practical collection of SQL queries for common code analysis tasks on DuckDB CPG.
Table of Contents¶
- Method Queries
- Call Analysis
- Control Flow
- Data Flow
- File Analysis
- Security Patterns
- Code Quality
- Statistics
- Advanced Patterns
- SQL/PGQ Graph Queries
- See Also
Method Queries¶
Find Method by Exact Name¶
SELECT id, name, full_name, filename, line_number, signature
FROM nodes_method
WHERE name = 'authenticate'
LIMIT 10;
Find Methods by Pattern¶
SELECT id, name, full_name, filename, line_number
FROM nodes_method
WHERE name LIKE '%process%'
ORDER BY name
LIMIT 50;
Methods by Signature Pattern¶
SELECT name, full_name, signature, filename
FROM nodes_method
WHERE signature LIKE '%char*%' -- Methods taking char* parameter
ORDER BY name
LIMIT 100;
External vs Internal Methods¶
-- External methods (from libraries)
SELECT name, full_name, COUNT(*) as count
FROM nodes_method
WHERE is_external = true
GROUP BY name, full_name
ORDER BY count DESC
LIMIT 20;
-- Internal methods (your code)
SELECT filename, COUNT(*) as method_count
FROM nodes_method
WHERE is_external = false
GROUP BY filename
ORDER BY method_count DESC
LIMIT 20;
Methods by Line Count¶
SELECT
name,
filename,
line_number,
line_number_end,
(line_number_end - line_number) as lines_of_code
FROM nodes_method
WHERE line_number_end IS NOT NULL
AND (line_number_end - line_number) > 50 -- Long methods
ORDER BY lines_of_code DESC
LIMIT 20;
Call Analysis¶
Direct Callees (What does X call?)¶
SELECT DISTINCT
callee.name,
callee.full_name,
callee.filename,
callee.line_number
FROM edges_call ec
JOIN nodes_call c ON ec.src = c.id
JOIN nodes_method caller ON c.method_full_name LIKE '%' || caller.name || '%'
JOIN nodes_method callee ON ec.dst = callee.id
WHERE caller.name = 'main'
ORDER BY callee.name
LIMIT 100;
Direct Callers (Who calls X?)¶
SELECT DISTINCT
caller.name,
caller.full_name,
caller.filename,
caller.line_number
FROM edges_call ec
JOIN nodes_call c ON ec.src = c.id
JOIN nodes_method callee ON ec.dst = callee.id
JOIN nodes_method caller ON c.method_full_name LIKE '%' || caller.name || '%'
WHERE callee.name = 'malloc'
ORDER BY caller.name
LIMIT 100;
Call Chain (Transitive Callees)¶
WITH RECURSIVE call_chain AS (
-- Base: direct calls from starting method
SELECT
ec.dst as method_id,
1 as depth,
m_start.name as start_method
FROM edges_call ec
JOIN nodes_call c ON ec.src = c.id
JOIN nodes_method m_start ON c.method_full_name LIKE '%' || m_start.name || '%'
WHERE m_start.name = 'main'
UNION ALL
-- Recursive: follow the chain
SELECT
ec2.dst,
cc.depth + 1,
cc.start_method
FROM edges_call ec2
JOIN nodes_call c2 ON ec2.src = c2.id
JOIN nodes_method m2 ON c2.method_full_name LIKE '%' || m2.name || '%'
JOIN call_chain cc ON m2.id = cc.method_id
WHERE cc.depth < 5
)
SELECT DISTINCT
m.name,
m.full_name,
MIN(cc.depth) as min_depth
FROM call_chain cc
JOIN nodes_method m ON cc.method_id = m.id
GROUP BY m.id, m.name, m.full_name
ORDER BY min_depth, m.name
LIMIT 100;
Top Callers (Methods Making Most Calls)¶
SELECT
m.name,
m.full_name,
m.filename,
COUNT(DISTINCT c.id) as outgoing_calls
FROM nodes_method m
LEFT JOIN nodes_call c ON c.method_full_name LIKE '%' || m.name || '%'
GROUP BY m.id, m.name, m.full_name, m.filename
ORDER BY outgoing_calls DESC
LIMIT 20;
Top Callees (Most Called Methods)¶
SELECT
m.name,
m.full_name,
m.filename,
COUNT(ec.src) as incoming_calls
FROM nodes_method m
LEFT JOIN edges_call ec ON m.id = ec.dst
GROUP BY m.id, m.name, m.full_name, m.filename
ORDER BY incoming_calls DESC
LIMIT 20;
Call Pairs (Frequent Caller-Callee Patterns)¶
SELECT
caller.name as caller,
callee.name as callee,
COUNT(*) as frequency
FROM edges_call ec
JOIN nodes_call c ON ec.src = c.id
JOIN nodes_method caller ON c.method_full_name LIKE '%' || caller.name || '%'
JOIN nodes_method callee ON ec.dst = callee.id
GROUP BY caller.name, callee.name
ORDER BY frequency DESC
LIMIT 50;
Leaf Methods (No Outgoing Calls)¶
SELECT
m.name,
m.full_name,
m.filename
FROM nodes_method m
WHERE NOT EXISTS (
SELECT 1
FROM nodes_call c
WHERE c.method_full_name LIKE '%' || m.name || '%'
)
AND m.is_external = false
ORDER BY m.name
LIMIT 100;
Root Methods (Not Called by Anyone)¶
SELECT
m.name,
m.full_name,
m.filename,
m.line_number
FROM nodes_method m
WHERE NOT EXISTS (
SELECT 1
FROM edges_call ec
WHERE ec.dst = m.id
)
AND m.is_external = false
ORDER BY m.filename, m.line_number
LIMIT 100;
Control Flow¶
Methods with Branches¶
-- Find methods containing IF/SWITCH structures via AST
SELECT DISTINCT
m.name,
m.full_name,
m.filename,
COUNT(DISTINCT cs.id) as branch_count
FROM nodes_method m
JOIN edges_ast ast ON m.id = ast.src
JOIN nodes_control_structure cs ON ast.dst = cs.id
WHERE cs.control_structure_type IN ('IF', 'SWITCH')
GROUP BY m.id, m.name, m.full_name, m.filename
ORDER BY branch_count DESC
LIMIT 20;
Methods with Loops¶
-- Find methods containing loop structures via AST
SELECT DISTINCT
m.name,
m.full_name,
m.filename,
COUNT(DISTINCT cs.id) as loop_count
FROM nodes_method m
JOIN edges_ast ast ON m.id = ast.src
JOIN nodes_control_structure cs ON ast.dst = cs.id
WHERE cs.control_structure_type IN ('WHILE', 'FOR', 'DO')
GROUP BY m.id, m.name, m.full_name, m.filename
ORDER BY loop_count DESC
LIMIT 20;
Cyclomatic Complexity Approximation¶
-- Approximate complexity by counting control structures
SELECT
m.name,
m.filename,
1 + COUNT(DISTINCT cs.id) as approx_complexity
FROM nodes_method m
LEFT JOIN edges_ast ast ON m.id = ast.src
LEFT JOIN nodes_control_structure cs ON ast.dst = cs.id
WHERE m.is_external = false
GROUP BY m.id, m.name, m.filename
ORDER BY approx_complexity DESC
LIMIT 30;
Control Flow Graph Traversal (SQL/PGQ)¶
-- Follow CFG edges using property graph
FROM GRAPH_TABLE(cpg
MATCH (start:METHOD)-[:CFG*1..10]->(node:CPG_NODE)
WHERE start.name = 'process_request'
COLUMNS (
start.name AS method_name,
node.id AS node_id,
node.node_type
)
)
LIMIT 100;
Data Flow¶
Variable Flow Paths¶
WITH RECURSIVE var_flow AS (
-- Base: initial definitions
SELECT src, dst, variable, 1 as depth
FROM edges_reaching_def
WHERE variable = 'password'
UNION ALL
-- Recursive: follow the flow
SELECT erd.src, erd.dst, erd.variable, vf.depth + 1
FROM edges_reaching_def erd
JOIN var_flow vf ON erd.src = vf.dst
WHERE vf.depth < 10
)
SELECT
variable,
COUNT(DISTINCT src) as definition_points,
COUNT(DISTINCT dst) as use_points,
MAX(depth) as max_depth
FROM var_flow
GROUP BY variable;
Tainted Parameters¶
-- Methods receiving tainted input
SELECT DISTINCT
m.name,
m.full_name,
p.name as parameter_name,
p.type_full_name as parameter_type
FROM nodes_method m
JOIN nodes_param p ON p.method_full_name = m.full_name
WHERE p.name IN ('userInput', 'request', 'input', 'data')
ORDER BY m.name
LIMIT 50;
File Analysis¶
Methods Per File¶
SELECT
filename,
COUNT(*) as method_count,
SUM(CASE WHEN is_external THEN 1 ELSE 0 END) as external_methods,
SUM(CASE WHEN is_external THEN 0 ELSE 1 END) as internal_methods
FROM nodes_method
GROUP BY filename
ORDER BY method_count DESC
LIMIT 30;
Files by Call Activity¶
SELECT
m.filename,
COUNT(DISTINCT c.id) as total_calls,
COUNT(DISTINCT c.name) as unique_calls
FROM nodes_method m
JOIN nodes_call c ON c.method_full_name LIKE '%' || m.name || '%'
GROUP BY m.filename
ORDER BY total_calls DESC
LIMIT 20;
Cross-File Calls¶
SELECT
caller.filename as from_file,
callee.filename as to_file,
COUNT(*) as call_count
FROM edges_call ec
JOIN nodes_call c ON ec.src = c.id
JOIN nodes_method caller ON c.method_full_name LIKE '%' || caller.name || '%'
JOIN nodes_method callee ON ec.dst = callee.id
WHERE caller.filename != callee.filename
GROUP BY caller.filename, callee.filename
ORDER BY call_count DESC
LIMIT 50;
Security Patterns¶
Memory Management Issues¶
-- Methods that malloc but don't free
SELECT DISTINCT m.name, m.full_name
FROM nodes_method m
WHERE EXISTS (
SELECT 1 FROM nodes_call c
WHERE c.method_full_name LIKE '%' || m.name || '%'
AND c.name = 'malloc'
)
AND NOT EXISTS (
SELECT 1 FROM nodes_call c
WHERE c.method_full_name LIKE '%' || m.name || '%'
AND c.name = 'free'
)
LIMIT 50;
Dangerous Function Calls¶
SELECT
m.name as caller,
m.filename,
c.name as dangerous_function,
m.line_number
FROM nodes_call c
JOIN nodes_method m ON c.method_full_name LIKE '%' || m.name || '%'
WHERE c.name IN ('strcpy', 'sprintf', 'gets', 'system', 'eval')
ORDER BY m.filename, m.line_number
LIMIT 100;
SQL Injection Candidates¶
SELECT DISTINCT
m.name,
m.full_name,
m.filename
FROM nodes_method m
WHERE EXISTS (
-- Has SQL-related calls
SELECT 1 FROM nodes_call c
WHERE c.method_full_name LIKE '%' || m.name || '%'
AND (c.name LIKE '%query%' OR c.name LIKE '%execute%')
)
AND EXISTS (
-- Receives user input
SELECT 1 FROM nodes_param p
WHERE p.method_full_name = m.full_name
AND (p.name LIKE '%input%' OR p.name LIKE '%request%')
)
LIMIT 50;
Authentication Bypass Candidates¶
SELECT
m.name,
m.full_name,
m.filename
FROM nodes_method m
WHERE m.name LIKE '%auth%'
OR m.name LIKE '%login%'
OR m.name LIKE '%verify%'
ORDER BY m.name
LIMIT 100;
Code Quality¶
God Methods (Too Long)¶
SELECT
name,
filename,
line_number,
line_number_end,
(line_number_end - line_number) as loc
FROM nodes_method
WHERE line_number_end IS NOT NULL
AND (line_number_end - line_number) > 100
ORDER BY loc DESC
LIMIT 20;
High Fan-Out (Too Many Calls)¶
SELECT
m.name,
m.filename,
COUNT(DISTINCT c.id) as fan_out
FROM nodes_method m
JOIN nodes_call c ON c.method_full_name LIKE '%' || m.name || '%'
GROUP BY m.id, m.name, m.filename
HAVING COUNT(DISTINCT c.id) > 10
ORDER BY fan_out DESC
LIMIT 20;
High Fan-In (Too Many Callers)¶
SELECT
m.name,
m.filename,
COUNT(DISTINCT ec.src) as fan_in
FROM nodes_method m
JOIN edges_call ec ON m.id = ec.dst
GROUP BY m.id, m.name, m.filename
HAVING COUNT(DISTINCT ec.src) > 10
ORDER BY fan_in DESC
LIMIT 20;
Unused Methods (Potential Dead Code)¶
SELECT
m.name,
m.full_name,
m.filename
FROM nodes_method m
WHERE m.is_external = false
AND NOT EXISTS (
SELECT 1 FROM edges_call ec WHERE ec.dst = m.id
)
AND m.name NOT IN ('main', 'test%') -- Exclude entry points
ORDER BY m.filename, m.name
LIMIT 100;
Statistics¶
CPG Overview¶
SELECT
(SELECT COUNT(*) FROM nodes_method) as total_methods,
(SELECT COUNT(*) FROM nodes_method WHERE is_external = false) as internal_methods,
(SELECT COUNT(*) FROM nodes_call) as total_calls,
(SELECT COUNT(*) FROM edges_call) as call_edges,
(SELECT COUNT(DISTINCT filename) FROM nodes_method) as file_count;
Call Distribution¶
SELECT
call_count_bucket,
COUNT(*) as methods_in_bucket
FROM (
SELECT
m.id,
CASE
WHEN call_count = 0 THEN '0 calls'
WHEN call_count <= 5 THEN '1-5 calls'
WHEN call_count <= 10 THEN '6-10 calls'
WHEN call_count <= 20 THEN '11-20 calls'
ELSE '20+ calls'
END as call_count_bucket
FROM nodes_method m
LEFT JOIN (
SELECT
m2.id,
COUNT(c.id) as call_count
FROM nodes_method m2
LEFT JOIN nodes_call c ON c.method_full_name LIKE '%' || m2.name || '%'
GROUP BY m2.id
) counts ON m.id = counts.id
)
GROUP BY call_count_bucket
ORDER BY
CASE call_count_bucket
WHEN '0 calls' THEN 1
WHEN '1-5 calls' THEN 2
WHEN '6-10 calls' THEN 3
WHEN '11-20 calls' THEN 4
ELSE 5
END;
Language Distribution (by filename extension)¶
SELECT
CASE
WHEN filename LIKE '%.c' THEN 'C'
WHEN filename LIKE '%.cpp' OR filename LIKE '%.cc' THEN 'C++'
WHEN filename LIKE '%.java' THEN 'Java'
WHEN filename LIKE '%.py' THEN 'Python'
WHEN filename LIKE '%.js' THEN 'JavaScript'
ELSE 'Other'
END as language,
COUNT(*) as method_count
FROM nodes_method
WHERE is_external = false
GROUP BY language
ORDER BY method_count DESC;
Method Signature Complexity¶
SELECT
m.name,
m.signature,
LENGTH(m.signature) as sig_length,
(LENGTH(m.signature) - LENGTH(REPLACE(m.signature, ',', ''))) + 1 as param_count
FROM nodes_method m
WHERE m.signature IS NOT NULL
AND m.is_external = false
ORDER BY param_count DESC, sig_length DESC
LIMIT 30;
Advanced Patterns¶
Find Method Chains (A → B → C)¶
WITH chain AS (
SELECT
caller.name as method_a,
intermediate.name as method_b,
callee.name as method_c
FROM edges_call ec1
JOIN nodes_call c1 ON ec1.src = c1.id
JOIN nodes_method caller ON c1.method_full_name LIKE '%' || caller.name || '%'
JOIN nodes_method intermediate ON ec1.dst = intermediate.id
JOIN nodes_call c2 ON c2.method_full_name LIKE '%' || intermediate.name || '%'
JOIN edges_call ec2 ON c2.id = ec2.src
JOIN nodes_method callee ON ec2.dst = callee.id
WHERE caller.name = 'main'
)
SELECT DISTINCT method_a, method_b, method_c
FROM chain
LIMIT 100;
Connected Components (Method Clusters)¶
-- Find methods in same call graph
WITH RECURSIVE component AS (
SELECT id, name, id as root
FROM nodes_method
WHERE name = 'main'
UNION
SELECT m.id, m.name, c.root
FROM nodes_method m
JOIN edges_call ec ON m.id = ec.dst OR m.id IN (
SELECT m2.id FROM nodes_method m2
JOIN nodes_call nc ON nc.method_full_name LIKE '%' || m2.name || '%'
WHERE nc.id = ec.src
)
JOIN component c ON ec.dst = c.id OR ec.src = c.id
)
SELECT
root,
COUNT(DISTINCT id) as component_size,
STRING_AGG(DISTINCT name, ', ') as methods
FROM component
GROUP BY root
ORDER BY component_size DESC
LIMIT 10;
Usage Tips¶
- Always add LIMIT to prevent returning millions of rows
- Use specific columns instead of SELECT *
- Test on small datasets first
- Add WHERE clauses to filter irrelevant results
- Use EXPLAIN to check query plans
- Monitor execution time (should be < 100ms for most queries)
Performance Notes¶
- Simple lookups: 1-3 ms
- JOINs (1-2 tables): 2-5 ms
- Aggregations: 4-8 ms
- Recursive CTEs: 10-50 ms (depends on depth)
All queries tested on sample database with 5 methods, 4 calls. Performance scales sub-linearly with data size due to indexes.
SQL/PGQ Graph Queries¶
DuckDB’s SQL/PGQ extension enables intuitive graph traversal queries.
Call Graph Traversal¶
-- Find all methods called by main (direct and indirect)
FROM GRAPH_TABLE(cpg
MATCH (caller:METHOD)-[:CALLS*1..3]->(callee:METHOD)
WHERE caller.name = 'main'
COLUMNS (
caller.name AS caller,
callee.name AS callee,
callee.filename
)
)
LIMIT 100;
Data Flow Analysis¶
-- Track data flow through reaching definitions
FROM GRAPH_TABLE(cpg
MATCH (src:IDENTIFIER)-[:REACHING_DEF*1..5]->(sink:CALL_NODE)
WHERE src.name = 'user_input'
COLUMNS (
src.name AS source_var,
sink.name AS sink_function,
sink.line_number
)
)
LIMIT 100;
AST Traversal¶
-- Find all nodes under a method in AST
FROM GRAPH_TABLE(cpg
MATCH (method:METHOD)-[:AST*1..5]->(child:CPG_NODE)
WHERE method.name = 'authenticate'
COLUMNS (
method.name AS method_name,
child.id AS child_id,
child.node_type
)
)
LIMIT 100;
Type Hierarchy¶
-- Find inheritance relationships
FROM GRAPH_TABLE(cpg
MATCH (derived:TYPE_DECL)-[:INHERITS_FROM]->(base:TYPE_NODE)
COLUMNS (
derived.name AS derived_type,
base.full_name AS base_type
)
)
Next Steps¶
- Try these queries on your CPG database
- Modify parameters to match your needs
- Combine patterns for complex analysis
- Monitor performance and optimize as needed
- Add your own patterns to this cookbook!
See Also¶
- Analysis Modules - CFGAnalyzer, FieldSensitiveTracer documentation
- CPG Export Guide - Export CPG to DuckDB
- Hypothesis System - Security hypothesis generation
- DuckDB SQL/PGQ Documentation
- CPG Specification v1.1